From d372f097f771db1fe6639d255d657ef72da7388b Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 16 Jul 2021 22:17:20 +1000 Subject: [PATCH] SP C: change number of words for RSA/DH Faster small code and fast code. Allow fixed 4096-bit FFDHE parameters in benchmark. Convert [u]int[32|64|128]*_t types to sp_[u]int[32|64|128]. Add a div for when top bits are all 1 WOLFSSL_SP_FAST_LARGE_CODE added to make mul_add function faster on non-embedded platforms. Change mod_exp window sizes for same performance but less memory. P256 with c32 now 9 words instead of 10. --- configure.ac | 23 + wolfcrypt/benchmark/benchmark.c | 14 + wolfcrypt/src/sp_arm32.c | 1003 +- wolfcrypt/src/sp_arm64.c | 1019 +- wolfcrypt/src/sp_armthumb.c | 1004 +- wolfcrypt/src/sp_c32.c | 32147 ++++++++++++++++++------------ wolfcrypt/src/sp_c64.c | 15851 +++++++++++---- wolfcrypt/src/sp_cortexm.c | 1003 +- wolfcrypt/src/sp_x86_64.c | 551 +- wolfcrypt/src/sp_x86_64_asm.S | 120 +- wolfcrypt/src/sp_x86_64_asm.asm | 40 +- wolfssl/wolfcrypt/sp_int.h | 38 +- 12 files changed, 34580 insertions(+), 18233 deletions(-) diff --git a/configure.ac b/configure.ac index 144c857e4..1fa58e4f6 100644 --- a/configure.ac +++ b/configure.ac @@ -5238,6 +5238,21 @@ do fi ;; + smallfast) + ENABLED_SP_SMALL=yes + ENABLED_SP_RSA=yes + ENABLED_SP_DH=yes + ENABLED_SP_FF_2048=yes + ENABLED_SP_FF_3072=yes + ENABLED_SP_ECC=yes + ENABLED_SP_EC_256=yes + if test "$host_cpu" = "x86_64"; then + ENABLED_SP_FF_4096=yes + ENABLED_SP_EC_384=yes + fi + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_FAST_MODEXP" + ;; + yes) ENABLED_SP_RSA=yes ENABLED_SP_DH=yes @@ -5390,6 +5405,14 @@ if test "$ENABLED_SP_RSA" = "yes" || test "$ENABLED_SP_DH" = "yes"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_4096" AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_4096" fi + + case $host_cpu in + *x86_64* | *aarch64*) + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_LARGE_CODE" + ;; + *) + ;; + esac fi if test "$ENABLED_ECC" != "no" && test "$ENABLED_SP_ECC" = "yes"; then ENABLED_SP=yes diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index e31a6f986..ff1dabaac 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -5056,7 +5056,11 @@ exit_bench_rsa_key: #endif #endif +#ifdef HAVE_FFDHE_4096 +#define BENCH_DH_KEY_SIZE 512 /* for 4096 bit */ +#else #define BENCH_DH_KEY_SIZE 384 /* for 3072 bit */ +#endif #define BENCH_DH_PRIV_SIZE (BENCH_DH_KEY_SIZE/8) void bench_dh(int doAsync) @@ -5127,6 +5131,12 @@ void bench_dh(int doAsync) dhKeySz = 3072; } #endif +#ifdef HAVE_FFDHE_4096 + else if (use_ffdhe == 4096) { + params = wc_Dh_ffdhe4096_Get(); + dhKeySz = 4096; + } +#endif /* clear for done cleanup */ XMEMSET(dhKey, 0, sizeof(dhKey)); @@ -7005,6 +7015,10 @@ int main(int argc, char** argv) else if (string_matches(argv[1], "-ffdhe3072")) use_ffdhe = 3072; #endif +#if !defined(NO_DH) && defined(HAVE_FFDHE_4096) + else if (string_matches(argv[1], "-ffdhe4096")) + use_ffdhe = 4096; +#endif #if defined(HAVE_ECC) && !defined(NO_ECC256) else if (string_matches(argv[1], "-p256")) bench_asym_algs |= BENCH_ECC_P256; diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 99d4935c7..fe6b5aa95 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -47,6 +47,17 @@ #include #ifdef WOLFSSL_SP_ARM32_ASM +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii \ + fprintf(stderr, name "=0x"); \ + for (ii = words - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define 
SP_PRINT_VAL(var, name) \ + fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 /* Read big endian unsigned byte array into r. @@ -175,7 +186,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_64(sp_digit* r, byte* a) { int i; int j; @@ -209,6 +220,18 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -4239,7 +4262,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -6110,7 +6133,7 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -6979,6 +7002,137 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const s static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 128]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 128; + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); + if (reduceA != 0) { + err = sp_2048_mod_64(t[1] + 64, a, m); + if (err == MP_OKAY) { + err = sp_2048_mod_64(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); + err = sp_2048_mod_64(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. 
*/ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_mont_mul_64(r, r, t[y], m, mp); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -7099,163 +7253,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_cond_sub_64(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
- */ -static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 128]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 128), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 128; - } - - sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_64(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); - if (reduceA != 0) { - err = sp_2048_mod_64(t[1] + 64, a, m); - if (err == MP_OKAY) { - err = sp_2048_mod_64(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); - err = sp_2048_mod_64(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_64(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_64(t[10], t[ 5], m, mp); - sp_2048_mont_mul_64(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_64(t[12], t[ 6], m, mp); - sp_2048_mont_mul_64(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_64(t[14], t[ 7], m, mp); - sp_2048_mont_mul_64(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_64(t[16], t[ 8], m, mp); - sp_2048_mont_mul_64(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_64(t[18], t[ 9], m, mp); - sp_2048_mont_mul_64(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_64(t[20], t[10], m, mp); - sp_2048_mont_mul_64(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_64(t[22], t[11], m, mp); - sp_2048_mont_mul_64(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_64(t[24], t[12], m, mp); - sp_2048_mont_mul_64(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_64(t[26], t[13], m, mp); - sp_2048_mont_mul_64(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_64(t[28], t[14], m, mp); - sp_2048_mont_mul_64(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_64(t[30], t[15], m, mp); - sp_2048_mont_mul_64(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. 
*/ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - - sp_2048_mont_mul_64(r, r, t[y], m, mp); - } - - XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); - sp_2048_mont_reduce_64(r, m, mp); - - mask = 0 - (sp_2048_cmp_64(r, m) >= 0); - sp_2048_cond_sub_64(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -7286,7 +7283,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[64 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -7384,7 +7381,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -7680,7 +7677,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -7771,7 +7768,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32); sp_2048_add_64(r, tmpb, tmpa); - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -8471,7 +8468,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; for (i=0; i<256 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -8670,7 +8667,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_96(sp_digit* r, byte* a) { int i; int j; @@ -8704,6 +8701,18 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -14652,7 +14661,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -17147,7 +17156,7 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -static int32_t sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -18368,6 +18377,137 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const s static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 192]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 192), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 192; + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); + if (reduceA != 0) { + err = sp_3072_mod_96(t[1] + 96, a, m); + if (err == MP_OKAY) { + err = sp_3072_mod_96(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); + err = sp_3072_mod_96(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 96); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_mont_mul_96(r, r, t[y], m, mp); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -18488,163 +18628,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_cond_sub_96(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 192]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 192), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 192; - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_96(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); - if (reduceA != 0) { - err = sp_3072_mod_96(t[1] + 96, a, m); - if (err == MP_OKAY) { - err = sp_3072_mod_96(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); - err = sp_3072_mod_96(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_96(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_96(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_96(t[10], t[ 5], m, mp); - sp_3072_mont_mul_96(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_96(t[12], t[ 6], m, mp); - sp_3072_mont_mul_96(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_96(t[14], t[ 7], m, mp); - sp_3072_mont_mul_96(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_96(t[16], t[ 8], m, mp); - sp_3072_mont_mul_96(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_96(t[18], t[ 9], m, mp); - sp_3072_mont_mul_96(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_96(t[20], t[10], m, mp); - sp_3072_mont_mul_96(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_96(t[22], t[11], m, mp); - sp_3072_mont_mul_96(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_96(t[24], t[12], m, mp); - sp_3072_mont_mul_96(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_96(t[26], t[13], m, mp); - sp_3072_mont_mul_96(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_96(t[28], t[14], m, mp); - sp_3072_mont_mul_96(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_96(t[30], t[15], m, mp); - sp_3072_mont_mul_96(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 
5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 96); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - - sp_3072_mont_mul_96(r, r, t[y], m, mp); - } - - XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); - sp_3072_mont_reduce_96(r, m, mp); - - mask = 0 - (sp_3072_cmp_96(r, m) >= 0); - sp_3072_cond_sub_96(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -18675,7 +18658,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[96 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -18773,7 +18756,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -19149,7 +19132,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -19240,7 +19223,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[48], 0, sizeof(sp_digit) * 48); sp_3072_add_96(r, tmpb, tmpa); - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -20132,7 +20115,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; for (i=0; i<384 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -20331,7 +20314,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_128(sp_digit* r, byte* a) { int i; int j; @@ -20365,6 +20348,18 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + #ifndef WOLFSSL_SP_SMALL /* Sub b from a into a. (a -= b) * @@ -24698,7 +24693,7 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -static int32_t sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -26271,6 +26266,137 @@ static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 256]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 256), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 256; + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_128(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); + if (reduceA != 0) { + err = sp_4096_mod_128(t[1] + 128, a, m); + if (err == MP_OKAY) { + err = sp_4096_mod_128(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); + err = sp_4096_mod_128(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 128); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + + sp_4096_mont_mul_128(r, r, t[y], m, mp); + } + + XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); + sp_4096_mont_reduce_128(r, m, mp); + + mask = 0 - (sp_4096_cmp_128(r, m) >= 0); + sp_4096_cond_sub_128(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -26391,163 +26517,6 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e sp_4096_cond_sub_128(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 256]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 256), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 256; - } - - sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_128(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); - if (reduceA != 0) { - err = sp_4096_mod_128(t[1] + 128, a, m); - if (err == MP_OKAY) { - err = sp_4096_mod_128(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); - err = sp_4096_mod_128(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_128(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_128(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_128(t[10], t[ 5], m, mp); - sp_4096_mont_mul_128(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_128(t[12], t[ 6], m, mp); - sp_4096_mont_mul_128(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_128(t[14], t[ 7], m, mp); - sp_4096_mont_mul_128(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_128(t[16], t[ 8], m, mp); - sp_4096_mont_mul_128(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_128(t[18], t[ 9], m, mp); - sp_4096_mont_mul_128(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_128(t[20], t[10], m, mp); - sp_4096_mont_mul_128(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_128(t[22], t[11], m, mp); - sp_4096_mont_mul_128(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_128(t[24], t[12], m, mp); - sp_4096_mont_mul_128(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_128(t[26], t[13], m, mp); - sp_4096_mont_mul_128(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_128(t[28], t[14], m, mp); - sp_4096_mont_mul_128(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_128(t[30], t[15], m, mp); - sp_4096_mont_mul_128(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - 
if (c == 0) { - c = 32; - } - c -= bits % 5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 128); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - - sp_4096_mont_mul_128(r, r, t[y], m, mp); - } - - XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); - sp_4096_mont_reduce_128(r, m, mp); - - mask = 0 - (sp_4096_cmp_128(r, m) >= 0); - sp_4096_cond_sub_128(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -26578,7 +26547,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[128 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -26676,7 +26645,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -27132,7 +27101,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -27223,7 +27192,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[64], 0, sizeof(sp_digit) * 64); sp_4096_add_128(r, tmpb, tmpa); - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -28307,7 +28276,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; for (i=0; i<512 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -31245,7 +31214,7 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_256_cmp_8(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -31588,7 +31557,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; - int32_t n; + sp_int32 n; sp_256_mont_inv_8(t1, p->z, t + 2*8); @@ -35645,7 +35614,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_8(sp_digit* r, byte* a) { int i; int j; @@ -35728,7 +35697,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_8(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_8(point->x, out); *outLen = 32; } @@ -36302,7 +36271,7 @@ static int sp_256_calc_s_8(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -36414,7 +36383,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 8U); sp_256_norm_8(ctx->r); @@ -36463,7 +36432,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_256_norm_8(ctx->x); carry = sp_256_add_8(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_8(ctx->s, ctx->s, @@ -36533,7 +36502,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -37145,7 +37114,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_8(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -37200,7 +37169,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -40667,7 +40636,7 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_384_cmp_12(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -40860,7 +40829,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; - int32_t n; + sp_int32 n; sp_384_mont_inv_12(t1, p->z, t + 2*12); @@ -44864,7 +44833,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_12(sp_digit* r, byte* a) { int i; int j; @@ -44947,7 +44916,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_12(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_12(point->x, out); *outLen = 48; } @@ -45540,7 +45509,7 @@ static int sp_384_calc_s_12(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -45652,7 +45621,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 12U); sp_384_norm_12(ctx->r); @@ -45701,7 +45670,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_384_norm_12(ctx->x); carry = sp_384_add_12(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_12(ctx->s, ctx->s, @@ -45771,7 +45740,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -46430,7 +46399,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_12(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -46485,7 +46454,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -51432,7 +51401,7 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int32_t sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) { sp_digit r = -1; sp_digit one = 1; @@ -52579,7 +52548,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; - int32_t n; + sp_int32 n; sp_1024_mont_inv_32(t1, p->z, t + 2*32); @@ -62906,7 +62875,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, sp_digit t1[32 * 4]; #endif sp_digit* t2 = NULL; - int32_t n; + sp_int32 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 478c39ed6..1e321fdde 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -47,6 +47,17 @@ #include #ifdef WOLFSSL_SP_ARM64_ASM +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii \ + fprintf(stderr, name "=0x"); \ + for (ii = words - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define SP_PRINT_VAL(var, name) \ + fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 /* Read big endian unsigned byte array into r. @@ -186,7 +197,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. 
*/ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_32(sp_digit* r, byte* a) { int i; int j; @@ -203,6 +214,18 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_32(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_32(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -2987,7 +3010,7 @@ static sp_digit div_2048_word_16(sp_digit d1, sp_digit d0, sp_digit div) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int64_t sp_2048_cmp_16(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_2048_cmp_16(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -3135,7 +3158,7 @@ static int64_t sp_2048_cmp_16(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -4239,7 +4262,7 @@ static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int64_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -4499,7 +4522,7 @@ static int64_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -4766,6 +4789,137 @@ static WC_INLINE int sp_2048_mod_32_cond(sp_digit* r, const sp_digit* a, const s static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 64]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 64), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 64; + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_32(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 32U); + if (reduceA != 0) { + err = sp_2048_mod_32(t[1] + 32, a, m); + if (err == MP_OKAY) { + err = sp_2048_mod_32(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32); + err = sp_2048_mod_32(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_32(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_32(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_32(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) { + c = 64; + } + c -= bits % 3; + if (c == 64) { + c = 61; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. 
*/ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 64 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 32); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 61); + n <<= 3; + c = 61; + } + else if (c < 3) { + y = (byte)(n >> 61); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)((n >> 61) & 0x7); + n <<= 3; + c -= 3; + } + + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + sp_2048_mont_sqr_32(r, r, m, mp); + + sp_2048_mont_mul_32(r, r, t[y], m, mp); + } + + XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U); + sp_2048_mont_reduce_32(r, m, mp); + + mask = 0 - (sp_2048_cmp_32(r, m) >= 0); + sp_2048_cond_sub_32(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -4886,163 +5040,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_cond_sub_32(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
- */ -static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 64]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 64), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 64; - } - - sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_32(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 32U); - if (reduceA != 0) { - err = sp_2048_mod_32(t[1] + 32, a, m); - if (err == MP_OKAY) { - err = sp_2048_mod_32(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32); - err = sp_2048_mod_32(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_2048_mont_sqr_32(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_32(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_32(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_32(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_32(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_32(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_32(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_32(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_32(t[10], t[ 5], m, mp); - sp_2048_mont_mul_32(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_32(t[12], t[ 6], m, mp); - sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_32(t[14], t[ 7], m, mp); - sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_32(t[16], t[ 8], m, mp); - sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_32(t[18], t[ 9], m, mp); - sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_32(t[20], t[10], m, mp); - sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_32(t[22], t[11], m, mp); - sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_32(t[24], t[12], m, mp); - sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_32(t[26], t[13], m, mp); - sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_32(t[28], t[14], m, mp); - sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_32(t[30], t[15], m, mp); - sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 64; - n = e[i--]; - c = bits & 63; - if (c == 0) { - c = 64; - } - c -= bits % 5; - if (c == 64) { - c = 59; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. 
*/ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 64 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; - } - else if (c < 5) { - y = (byte)(n >> 59); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_sqr_32(r, r, m, mp); - sp_2048_mont_sqr_32(r, r, m, mp); - - sp_2048_mont_mul_32(r, r, t[y], m, mp); - } - - XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U); - sp_2048_mont_reduce_32(r, m, mp); - - mask = 0 - (sp_2048_cmp_32(r, m) >= 0); - sp_2048_cond_sub_32(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -5073,7 +5070,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[32 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -5171,7 +5168,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -5363,7 +5360,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -5454,7 +5451,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[16], 0, sizeof(sp_digit) * 16); sp_2048_add_32(r, tmpb, tmpa); - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -5964,7 +5961,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; for (i=0; i<256 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -6174,7 +6171,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_48(sp_digit* r, byte* a) { int i; int j; @@ -6191,6 +6188,18 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_48(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_48(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -10512,7 +10521,7 @@ static sp_digit div_3072_word_24(sp_digit d1, sp_digit d0, sp_digit div) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int64_t sp_3072_cmp_24(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_3072_cmp_24(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -10716,7 +10725,7 @@ static int64_t sp_3072_cmp_24(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -12092,7 +12101,7 @@ static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -static int64_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -12464,7 +12473,7 @@ static int64_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -12771,6 +12780,137 @@ static WC_INLINE int sp_3072_mod_48_cond(sp_digit* r, const sp_digit* a, const s static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 96]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 96), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 96; + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_48(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 48U); + if (reduceA != 0) { + err = sp_3072_mod_48(t[1] + 48, a, m); + if (err == MP_OKAY) { + err = sp_3072_mod_48(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48); + err = sp_3072_mod_48(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_48(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_48(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_48(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_48(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_48(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_48(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) { + c = 64; + } + c -= bits % 3; + if (c == 64) { + c = 61; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 64 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 48); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 61); + n <<= 3; + c = 61; + } + else if (c < 3) { + y = (byte)(n >> 61); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)((n >> 61) & 0x7); + n <<= 3; + c -= 3; + } + + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + sp_3072_mont_sqr_48(r, r, m, mp); + + sp_3072_mont_mul_48(r, r, t[y], m, mp); + } + + XMEMSET(&r[48], 0, sizeof(sp_digit) * 48U); + sp_3072_mont_reduce_48(r, m, mp); + + mask = 0 - (sp_3072_cmp_48(r, m) >= 0); + sp_3072_cond_sub_48(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -12891,163 +13031,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_cond_sub_48(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 96]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 96), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 96; - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_48(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 48U); - if (reduceA != 0) { - err = sp_3072_mod_48(t[1] + 48, a, m); - if (err == MP_OKAY) { - err = sp_3072_mod_48(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48); - err = sp_3072_mod_48(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_3072_mont_sqr_48(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_48(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_48(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_48(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_48(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_48(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_48(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_48(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_48(t[10], t[ 5], m, mp); - sp_3072_mont_mul_48(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_48(t[12], t[ 6], m, mp); - sp_3072_mont_mul_48(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_48(t[14], t[ 7], m, mp); - sp_3072_mont_mul_48(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_48(t[16], t[ 8], m, mp); - sp_3072_mont_mul_48(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_48(t[18], t[ 9], m, mp); - sp_3072_mont_mul_48(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_48(t[20], t[10], m, mp); - sp_3072_mont_mul_48(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_48(t[22], t[11], m, mp); - sp_3072_mont_mul_48(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_48(t[24], t[12], m, mp); - sp_3072_mont_mul_48(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_48(t[26], t[13], m, mp); - sp_3072_mont_mul_48(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_48(t[28], t[14], m, mp); - sp_3072_mont_mul_48(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_48(t[30], t[15], m, mp); - sp_3072_mont_mul_48(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 64; - n = e[i--]; - c = bits & 63; - if (c == 0) { - c = 64; - } - c -= bits % 5; 
- if (c == 64) { - c = 59; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 64 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 48); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; - } - else if (c < 5) { - y = (byte)(n >> 59); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_sqr_48(r, r, m, mp); - sp_3072_mont_sqr_48(r, r, m, mp); - - sp_3072_mont_mul_48(r, r, t[y], m, mp); - } - - XMEMSET(&r[48], 0, sizeof(sp_digit) * 48U); - sp_3072_mont_reduce_48(r, m, mp); - - mask = 0 - (sp_3072_cmp_48(r, m) >= 0); - sp_3072_cond_sub_48(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -13078,7 +13061,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[48 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -13176,7 +13159,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -13396,7 +13379,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -13487,7 +13470,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[24], 0, sizeof(sp_digit) * 24); sp_3072_add_48(r, tmpb, tmpa); - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -14093,7 +14076,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; for (i=0; i<384 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -14303,7 +14286,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_64(sp_digit* r, byte* a) { int i; int j; @@ -14320,6 +14303,18 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_64(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_64(a) + #ifndef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * @@ -17130,7 +17125,7 @@ static void sp_4096_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -static int64_t sp_4096_cmp_64(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_4096_cmp_64(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -17614,7 +17609,7 @@ static int64_t sp_4096_cmp_64(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -17961,6 +17956,137 @@ static WC_INLINE int sp_4096_mod_64_cond(sp_digit* r, const sp_digit* a, const s static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 128]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 128; + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_64(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); + if (reduceA != 0) { + err = sp_4096_mod_64(t[1] + 64, a, m); + if (err == MP_OKAY) { + err = sp_4096_mod_64(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); + err = sp_4096_mod_64(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_64(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_64(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_64(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 64; + n = e[i--]; + c = bits & 63; + if (c == 0) { + c = 64; + } + c -= bits % 3; + if (c == 64) { + c = 61; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 64 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 61); + n <<= 3; + c = 61; + } + else if (c < 3) { + y = (byte)(n >> 61); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)((n >> 61) & 0x7); + n <<= 3; + c -= 3; + } + + sp_4096_mont_sqr_64(r, r, m, mp); + sp_4096_mont_sqr_64(r, r, m, mp); + sp_4096_mont_sqr_64(r, r, m, mp); + + sp_4096_mont_mul_64(r, r, t[y], m, mp); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); + sp_4096_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_4096_cmp_64(r, m) >= 0); + sp_4096_cond_sub_64(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -18081,163 +18207,6 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_4096_cond_sub_64(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 128]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 128), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 128; - } - - sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_64(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); - if (reduceA != 0) { - err = sp_4096_mod_64(t[1] + 64, a, m); - if (err == MP_OKAY) { - err = sp_4096_mod_64(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); - err = sp_4096_mod_64(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_4096_mont_sqr_64(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_64(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_64(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_64(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_64(t[10], t[ 5], m, mp); - sp_4096_mont_mul_64(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_64(t[12], t[ 6], m, mp); - sp_4096_mont_mul_64(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_64(t[14], t[ 7], m, mp); - sp_4096_mont_mul_64(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_64(t[16], t[ 8], m, mp); - sp_4096_mont_mul_64(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_64(t[18], t[ 9], m, mp); - sp_4096_mont_mul_64(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_64(t[20], t[10], m, mp); - sp_4096_mont_mul_64(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_64(t[22], t[11], m, mp); - sp_4096_mont_mul_64(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_64(t[24], t[12], m, mp); - sp_4096_mont_mul_64(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_64(t[26], t[13], m, mp); - sp_4096_mont_mul_64(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_64(t[28], t[14], m, mp); - sp_4096_mont_mul_64(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_64(t[30], t[15], m, mp); - sp_4096_mont_mul_64(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 64; - n = e[i--]; - c = bits & 63; - if (c == 0) { - c = 64; - } - c -= bits % 
5; - if (c == 64) { - c = 59; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 64 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; - } - else if (c < 5) { - y = (byte)(n >> 59); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_4096_mont_sqr_64(r, r, m, mp); - sp_4096_mont_sqr_64(r, r, m, mp); - sp_4096_mont_sqr_64(r, r, m, mp); - sp_4096_mont_sqr_64(r, r, m, mp); - sp_4096_mont_sqr_64(r, r, m, mp); - - sp_4096_mont_mul_64(r, r, t[y], m, mp); - } - - XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); - sp_4096_mont_reduce_64(r, m, mp); - - mask = 0 - (sp_4096_cmp_64(r, m) >= 0); - sp_4096_cond_sub_64(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -18268,7 +18237,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[64 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -18366,7 +18335,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -18614,7 +18583,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -18705,7 +18674,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32); sp_4096_add_64(r, tmpb, tmpa); - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -19407,7 +19376,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; for (i=0; i<512 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -20555,7 +20524,7 @@ static void sp_256_mont_inv_4(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int64_t sp_256_cmp_4(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_256_cmp_4(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -20619,7 +20588,7 @@ static int64_t sp_256_cmp_4(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Normalize the values in each word to 64. @@ -20833,7 +20802,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; - int64_t n; + sp_int64 n; sp_256_mont_inv_4(t1, p->z, t + 2*4); @@ -37359,7 +37328,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_4(sp_digit* r, byte* a) { int i; int j; @@ -37425,7 +37394,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_4(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_4(point->x, out); *outLen = 32; } @@ -37861,7 +37830,7 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -37973,7 +37942,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 4U); sp_256_norm_4(ctx->r); @@ -38022,7 +37991,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_256_norm_4(ctx->x); carry = sp_256_add_4(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_4(ctx->s, ctx->s, @@ -38092,7 +38061,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; @@ -38700,7 +38669,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_4(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -38755,7 +38724,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -40710,7 +40679,7 @@ static void sp_384_mont_inv_6(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static int64_t sp_384_cmp_6(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_384_cmp_6(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -40788,7 +40757,7 @@ static int64_t sp_384_cmp_6(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Normalize the values in each word to 64. @@ -40808,7 +40777,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; - int64_t n; + sp_int64 n; sp_384_mont_inv_6(t1, p->z, t + 2*6); @@ -63083,7 +63052,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_6(sp_digit* r, byte* a) { int i; int j; @@ -63149,7 +63118,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_6(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_6(point->x, out); *outLen = 48; } @@ -63620,7 +63589,7 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -63732,7 +63701,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 6U); sp_384_norm_6(ctx->r); @@ -63781,7 +63750,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_384_norm_6(ctx->x); carry = sp_384_add_6(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_6(ctx->s, ctx->s, @@ -63851,7 +63820,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; @@ -64344,7 +64313,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_6(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -64399,7 +64368,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -66726,7 +66695,7 @@ static void sp_1024_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -static int64_t sp_1024_cmp_16(const sp_digit* a, const sp_digit* b) +static sp_int64 sp_1024_cmp_16(const sp_digit* a, const sp_digit* b) { #ifdef WOLFSSL_SP_SMALL __asm__ __volatile__ ( @@ -66874,7 +66843,7 @@ static int64_t sp_1024_cmp_16(const sp_digit* a, const sp_digit* b) ); #endif - return (int64_t)a; + return (sp_int64)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -67625,7 +67594,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; - int64_t n; + sp_int64 n; sp_1024_mont_inv_16(t1, p->z, t + 2*16); @@ -76752,7 +76721,7 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point, sp_digit t1[16 * 4]; #endif sp_digit* t2 = NULL; - int64_t n; + sp_int64 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index 0cd63fff2..cecf564b0 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -47,6 +47,17 @@ #include #ifdef WOLFSSL_SP_ARM_THUMB_ASM +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii \ + fprintf(stderr, name "=0x"); \ + for (ii = words - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define SP_PRINT_VAL(var, name) \ + fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 /* Read big endian unsigned byte array into r. @@ -175,7 +186,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_64(sp_digit* r, byte* a) { int i; int j; @@ -209,6 +220,18 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -6353,7 +6376,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -7845,7 +7868,7 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -8054,6 +8077,137 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const s static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 128]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 128; + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); + if (reduceA != 0) { + err = sp_2048_mod_64(t[1] + 64, a, m); + if (err == MP_OKAY) { + err = sp_2048_mod_64(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); + err = sp_2048_mod_64(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_mont_mul_64(r, r, t[y], m, mp); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -8174,163 +8328,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_cond_sub_64(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 128]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 128), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 128; - } - - sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_64(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); - if (reduceA != 0) { - err = sp_2048_mod_64(t[1] + 64, a, m); - if (err == MP_OKAY) { - err = sp_2048_mod_64(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); - err = sp_2048_mod_64(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_64(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_64(t[10], t[ 5], m, mp); - sp_2048_mont_mul_64(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_64(t[12], t[ 6], m, mp); - sp_2048_mont_mul_64(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_64(t[14], t[ 7], m, mp); - sp_2048_mont_mul_64(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_64(t[16], t[ 8], m, mp); - sp_2048_mont_mul_64(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_64(t[18], t[ 9], m, mp); - sp_2048_mont_mul_64(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_64(t[20], t[10], m, mp); - sp_2048_mont_mul_64(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_64(t[22], t[11], m, mp); - sp_2048_mont_mul_64(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_64(t[24], t[12], m, mp); - sp_2048_mont_mul_64(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_64(t[26], t[13], m, mp); - sp_2048_mont_mul_64(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_64(t[28], t[14], m, mp); - sp_2048_mont_mul_64(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_64(t[30], t[15], m, mp); - sp_2048_mont_mul_64(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 5; 
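/* Every mod_exp variant in this patch finishes the same way: after the
 * Montgomery reduction the result can still be one modulus too large, and a
 * mask built from the comparison drives a conditional subtract, so the final
 * reduction happens without a data-dependent branch.  A minimal standalone
 * sketch of that pattern on 32-bit words (WORDS, the helper names, and
 * deriving the mask from a trial-subtraction borrow instead of the dedicated
 * compare routine are assumptions of this sketch, not the generated code):
 */
#include <stdint.h>

#define WORDS 8                   /* example size only */

/* r = a - (m & mask), word by word, propagating the borrow. */
static void cond_sub_words(uint32_t* r, const uint32_t* a, const uint32_t* m,
                           uint32_t mask)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < WORDS; i++) {
        uint64_t d = (uint64_t)a[i] - (m[i] & mask) - borrow;
        r[i] = (uint32_t)d;
        borrow = (d >> 32) & 1;   /* 1 exactly when the word subtraction wrapped */
    }
}

/* Reduce r once modulo m, assuming 0 <= r < 2*m, without branching. */
static void final_reduce(uint32_t* r, const uint32_t* m)
{
    uint64_t borrow = 0;
    uint32_t mask;
    int i;

    /* Trial subtraction, used only to learn whether r >= m. */
    for (i = 0; i < WORDS; i++) {
        uint64_t d = (uint64_t)r[i] - m[i] - borrow;
        borrow = (d >> 32) & 1;
    }

    mask = (uint32_t)borrow - 1U; /* all ones when r >= m, else all zeros */
    cond_sub_words(r, r, m, mask);
}
/* With a mask of all ones the subtraction takes effect; with all zeros r is
 * rewritten unchanged, and the loads and stores performed are identical in
 * both cases.
 */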
- if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - - sp_2048_mont_mul_64(r, r, t[y], m, mp); - } - - XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); - sp_2048_mont_reduce_64(r, m, mp); - - mask = 0 - (sp_2048_cmp_64(r, m) >= 0); - sp_2048_cond_sub_64(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -8361,7 +8358,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[64 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -8459,7 +8456,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -8612,7 +8609,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -8703,7 +8700,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32); sp_2048_add_64(r, tmpb, tmpa); - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -10473,7 +10470,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; for (i=0; i<256 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -10672,7 +10669,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_96(sp_digit* r, byte* a) { int i; int j; @@ -10706,6 +10703,18 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -17922,7 +17931,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -19419,7 +19428,7 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -19633,6 +19642,137 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const s static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 192]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 192), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 192; + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); + if (reduceA != 0) { + err = sp_3072_mod_96(t[1] + 96, a, m); + if (err == MP_OKAY) { + err = sp_3072_mod_96(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); + err = sp_3072_mod_96(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 96); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_mont_mul_96(r, r, t[y], m, mp); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -19753,163 +19893,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_cond_sub_96(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 192]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 192), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 192; - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_96(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); - if (reduceA != 0) { - err = sp_3072_mod_96(t[1] + 96, a, m); - if (err == MP_OKAY) { - err = sp_3072_mod_96(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); - err = sp_3072_mod_96(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_96(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_96(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_96(t[10], t[ 5], m, mp); - sp_3072_mont_mul_96(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_96(t[12], t[ 6], m, mp); - sp_3072_mont_mul_96(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_96(t[14], t[ 7], m, mp); - sp_3072_mont_mul_96(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_96(t[16], t[ 8], m, mp); - sp_3072_mont_mul_96(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_96(t[18], t[ 9], m, mp); - sp_3072_mont_mul_96(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_96(t[20], t[10], m, mp); - sp_3072_mont_mul_96(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_96(t[22], t[11], m, mp); - sp_3072_mont_mul_96(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_96(t[24], t[12], m, mp); - sp_3072_mont_mul_96(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_96(t[26], t[13], m, mp); - sp_3072_mont_mul_96(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_96(t[28], t[14], m, mp); - sp_3072_mont_mul_96(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_96(t[30], t[15], m, mp); - sp_3072_mont_mul_96(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 
5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 96); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - - sp_3072_mont_mul_96(r, r, t[y], m, mp); - } - - XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); - sp_3072_mont_reduce_96(r, m, mp); - - mask = 0 - (sp_3072_cmp_96(r, m) >= 0); - sp_3072_cond_sub_96(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -19940,7 +19923,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[96 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -20038,7 +20021,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -20191,7 +20174,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -20282,7 +20265,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[48], 0, sizeof(sp_digit) * 48); sp_3072_add_96(r, tmpb, tmpa); - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -22786,7 +22769,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; for (i=0; i<384 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -22985,7 +22968,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_128(sp_digit* r, byte* a) { int i; int j; @@ -23019,6 +23002,18 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + #ifndef WOLFSSL_SP_SMALL /* Sub b from a into a. (a -= b) * @@ -27482,7 +27477,8 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -27696,6 +27692,137 @@ static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 256]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 256), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 256; + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_128(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); + if (reduceA != 0) { + err = sp_4096_mod_128(t[1] + 128, a, m); + if (err == MP_OKAY) { + err = sp_4096_mod_128(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); + err = sp_4096_mod_128(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 128); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + + sp_4096_mont_mul_128(r, r, t[y], m, mp); + } + + XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); + sp_4096_mont_reduce_128(r, m, mp); + + mask = 0 - (sp_4096_cmp_128(r, m) >= 0); + sp_4096_cond_sub_128(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -27816,163 +27943,6 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e sp_4096_cond_sub_128(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 256]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 256), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 256; - } - - sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_128(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); - if (reduceA != 0) { - err = sp_4096_mod_128(t[1] + 128, a, m); - if (err == MP_OKAY) { - err = sp_4096_mod_128(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); - err = sp_4096_mod_128(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_128(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_128(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_128(t[10], t[ 5], m, mp); - sp_4096_mont_mul_128(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_128(t[12], t[ 6], m, mp); - sp_4096_mont_mul_128(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_128(t[14], t[ 7], m, mp); - sp_4096_mont_mul_128(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_128(t[16], t[ 8], m, mp); - sp_4096_mont_mul_128(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_128(t[18], t[ 9], m, mp); - sp_4096_mont_mul_128(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_128(t[20], t[10], m, mp); - sp_4096_mont_mul_128(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_128(t[22], t[11], m, mp); - sp_4096_mont_mul_128(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_128(t[24], t[12], m, mp); - sp_4096_mont_mul_128(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_128(t[26], t[13], m, mp); - sp_4096_mont_mul_128(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_128(t[28], t[14], m, mp); - sp_4096_mont_mul_128(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_128(t[30], t[15], m, mp); - sp_4096_mont_mul_128(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - 
if (c == 0) { - c = 32; - } - c -= bits % 5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 128); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - - sp_4096_mont_mul_128(r, r, t[y], m, mp); - } - - XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); - sp_4096_mont_reduce_128(r, m, mp); - - mask = 0 - (sp_4096_cmp_128(r, m) >= 0); - sp_4096_cond_sub_128(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -28003,7 +27973,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[128 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -28101,7 +28071,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -28259,7 +28229,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -28350,7 +28320,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[64], 0, sizeof(sp_digit) * 64); sp_4096_add_128(r, tmpb, tmpa); - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -31578,7 +31548,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; for (i=0; i<512 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -33777,7 +33747,7 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_256_cmp_8(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -33885,7 +33855,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; - int32_t n; + sp_int32 n; sp_256_mont_inv_8(t1, p->z, t + 2*8); @@ -38580,7 +38550,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_8(sp_digit* r, byte* a) { int i; int j; @@ -38663,7 +38633,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_8(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_8(point->x, out); *outLen = 32; } @@ -39845,7 +39815,7 @@ static int sp_256_calc_s_8(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -39957,7 +39927,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 8U); sp_256_norm_8(ctx->r); @@ -40006,7 +39976,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_256_norm_8(ctx->x); carry = sp_256_add_8(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_8(ctx->s, ctx->s, @@ -40076,7 +40046,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -41570,7 +41540,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_8(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -41625,7 +41595,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -44282,7 +44252,7 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_384_cmp_12(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -44390,7 +44360,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; - int32_t n; + sp_int32 n; sp_384_mont_inv_12(t1, p->z, t + 2*12); @@ -48556,7 +48526,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_12(sp_digit* r, byte* a) { int i; int j; @@ -48639,7 +48609,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_12(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_12(point->x, out); *outLen = 48; } @@ -49828,7 +49798,7 @@ static int sp_384_calc_s_12(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -49940,7 +49910,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 12U); sp_384_norm_12(ctx->r); @@ -49989,7 +49959,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_384_norm_12(ctx->x); carry = sp_384_add_12(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_12(ctx->s, ctx->s, @@ -50059,7 +50029,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -51976,7 +51946,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_12(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -52031,7 +52001,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -55906,7 +55876,7 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "movs r2, #0\n\t" @@ -56820,7 +56790,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; - int32_t n; + sp_int32 n; sp_1024_mont_inv_32(t1, p->z, t + 2*32); @@ -70672,7 +70642,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, sp_digit t1[32 * 4]; #endif sp_digit* t2 = NULL; - int32_t n; + sp_int32 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 390a6b767..47a836c13 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -52,10 +52,28 @@ #ifndef WOLFSSL_SP_ASM #if SP_WORD_SIZE == 32 -#if ((!defined(WC_NO_CACHE_RESISTANT) && \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + byte n[bits / 8]; \ + sp_digit s[words]; \ + XMEMCPY(s, var, sizeof(s)); \ + sp_##total##_norm_##words(s); \ + sp_##total##_to_bin_##words(s, n); \ + fprintf(stderr, name "=0x"); \ + for (ii=0; ii= 0; i--) { r[j] |= (((sp_digit)a[i]) << s); - if (s >= 15U) { - r[j] &= 0x7fffff; - s = 23U - s; + if (s >= 21U) { + r[j] &= 0x1fffffff; + s = 29U - s; if (j + 1 >= size) { break; } @@ -109,7 +127,7 @@ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n) */ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { -#if DIGIT_BIT == 23 +#if DIGIT_BIT == 29 int j; XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); @@ -117,7 +135,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) for (j = a->used; j < size; j++) { r[j] = 0; } -#elif DIGIT_BIT > 23 +#elif DIGIT_BIT > 29 int i; int j = 0; word32 s = 0; @@ -125,16 +143,16 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i] << s); - r[j] &= 0x7fffff; - s = 23U - s; + r[j] &= 0x1fffffff; + s = 29U - s; if (j + 1 >= size) { break; } /* lint allow cast of mismatch word32 and mp_digit */ r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ - while ((s + 23U) <= (word32)DIGIT_BIT) { - s += 23U; - r[j] &= 0x7fffff; + while ((s + 29U) <= (word32)DIGIT_BIT) { + s += 29U; + r[j] &= 0x1fffffff; if (j + 1 >= size) { break; } @@ -160,12 +178,12 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i]) << s; - if (s + DIGIT_BIT >= 23) { - r[j] &= 0x7fffff; + if (s + DIGIT_BIT >= 29) { + r[j] &= 0x1fffffff; if (j + 1 >= size) { break; } - s = 23 - s; + s = 29 - s; if (s == DIGIT_BIT) { r[++j] = 0; s = 0; @@ -192,20 +210,20 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. 
*/ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_72(sp_digit* r, byte* a) { int i; int j; int s = 0; int b; - for (i=0; i<89; i++) { - r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; + for (i=0; i<71; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } j = 2048 / 8 - 1; a[j] = 0; - for (i=0; i<90 && j>=0; i++) { + for (i=0; i<72 && j>=0; i++) { b = 0; /* lint allow cast of mismatch sp_digit and int */ a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ @@ -213,14 +231,14 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) if (j < 0) { break; } - while (b < 23) { + while (b < 29) { a[j--] = (byte)(r[i] >> b); b += 8; if (j < 0) { break; } } - s = 8 - (b - 23); + s = 8 - (b - 29); if (j >= 0) { a[j] = 0; } @@ -230,6 +248,70 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_36(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 35; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + int i; + for (i = 0; i < 32; i += 8) { + a[i+1] += a[i+0] >> 29; a[i+0] &= 0x1fffffff; + a[i+2] += a[i+1] >> 29; a[i+1] &= 0x1fffffff; + a[i+3] += a[i+2] >> 29; a[i+2] &= 0x1fffffff; + a[i+4] += a[i+3] >> 29; a[i+3] &= 0x1fffffff; + a[i+5] += a[i+4] >> 29; a[i+4] &= 0x1fffffff; + a[i+6] += a[i+5] >> 29; a[i+5] &= 0x1fffffff; + a[i+7] += a[i+6] >> 29; a[i+6] &= 0x1fffffff; + a[i+8] += a[i+7] >> 29; a[i+7] &= 0x1fffffff; + } + a[33] += a[32] >> 29; a[32] &= 0x1fffffff; + a[34] += a[33] >> 29; a[33] &= 0x1fffffff; + a[35] += a[34] >> 29; a[34] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_72(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 71; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + int i; + for (i = 0; i < 64; i += 8) { + a[i+1] += a[i+0] >> 29; a[i+0] &= 0x1fffffff; + a[i+2] += a[i+1] >> 29; a[i+1] &= 0x1fffffff; + a[i+3] += a[i+2] >> 29; a[i+2] &= 0x1fffffff; + a[i+4] += a[i+3] >> 29; a[i+3] &= 0x1fffffff; + a[i+5] += a[i+4] >> 29; a[i+4] &= 0x1fffffff; + a[i+6] += a[i+5] >> 29; a[i+5] &= 0x1fffffff; + a[i+7] += a[i+6] >> 29; a[i+6] &= 0x1fffffff; + a[i+8] += a[i+7] >> 29; a[i+7] &= 0x1fffffff; + } + a[65] += a[64] >> 29; a[64] &= 0x1fffffff; + a[66] += a[65] >> 29; a[65] &= 0x1fffffff; + a[67] += a[66] >> 29; a[66] &= 0x1fffffff; + a[68] += a[67] >> 29; a[67] &= 0x1fffffff; + a[69] += a[68] >> 29; a[68] &= 0x1fffffff; + a[70] += a[69] >> 29; a[69] &= 0x1fffffff; + a[71] += a[70] >> 29; a[70] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -237,265 +319,178 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. 
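The 12-word multiply and square that follow accumulate each whole result column in an sp_uint64 before peeling off 29-bit digits. A quick standalone check, illustrative only, of why that cannot overflow; keeping the sums unsigned (sp_uint64 rather than int64_t) also avoids any signed-overflow concerns in these accumulations:

/* Each 29x29-bit product is below 2^58, so the widest 12-term column plus
 * a carried-in word stays under 2^62 and fits a 64-bit unsigned sum. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const uint64_t max_limb = (1ULL << 29) - 1;       /* 0x1fffffff */
    const uint64_t max_prod = max_limb * max_limb;    /* < 2^58 */
    const uint64_t max_col  = 12 * max_prod;          /* widest column */

    printf("product < 2^58 : %d\n", max_prod < (1ULL << 58));
    printf("12 terms no wrap: %d\n", max_col / 12 == max_prod);
    printf("room for carry  : %d\n", max_col < (1ULL << 62));
    return 0;
}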
*/ -SP_NOINLINE static void sp_2048_mul_15(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int64_t t0 = ((int64_t)a[ 0]) * b[ 0]; - int64_t t1 = ((int64_t)a[ 0]) * b[ 1] - + ((int64_t)a[ 1]) * b[ 0]; - int64_t t2 = ((int64_t)a[ 0]) * b[ 2] - + ((int64_t)a[ 1]) * b[ 1] - + ((int64_t)a[ 2]) * b[ 0]; - int64_t t3 = ((int64_t)a[ 0]) * b[ 3] - + ((int64_t)a[ 1]) * b[ 2] - + ((int64_t)a[ 2]) * b[ 1] - + ((int64_t)a[ 3]) * b[ 0]; - int64_t t4 = ((int64_t)a[ 0]) * b[ 4] - + ((int64_t)a[ 1]) * b[ 3] - + ((int64_t)a[ 2]) * b[ 2] - + ((int64_t)a[ 3]) * b[ 1] - + ((int64_t)a[ 4]) * b[ 0]; - int64_t t5 = ((int64_t)a[ 0]) * b[ 5] - + ((int64_t)a[ 1]) * b[ 4] - + ((int64_t)a[ 2]) * b[ 3] - + ((int64_t)a[ 3]) * b[ 2] - + ((int64_t)a[ 4]) * b[ 1] - + ((int64_t)a[ 5]) * b[ 0]; - int64_t t6 = ((int64_t)a[ 0]) * b[ 6] - + ((int64_t)a[ 1]) * b[ 5] - + ((int64_t)a[ 2]) * b[ 4] - + ((int64_t)a[ 3]) * b[ 3] - + ((int64_t)a[ 4]) * b[ 2] - + ((int64_t)a[ 5]) * b[ 1] - + ((int64_t)a[ 6]) * b[ 0]; - int64_t t7 = ((int64_t)a[ 0]) * b[ 7] - + ((int64_t)a[ 1]) * b[ 6] - + ((int64_t)a[ 2]) * b[ 5] - + ((int64_t)a[ 3]) * b[ 4] - + ((int64_t)a[ 4]) * b[ 3] - + ((int64_t)a[ 5]) * b[ 2] - + ((int64_t)a[ 6]) * b[ 1] - + ((int64_t)a[ 7]) * b[ 0]; - int64_t t8 = ((int64_t)a[ 0]) * b[ 8] - + ((int64_t)a[ 1]) * b[ 7] - + ((int64_t)a[ 2]) * b[ 6] - + ((int64_t)a[ 3]) * b[ 5] - + ((int64_t)a[ 4]) * b[ 4] - + ((int64_t)a[ 5]) * b[ 3] - + ((int64_t)a[ 6]) * b[ 2] - + ((int64_t)a[ 7]) * b[ 1] - + ((int64_t)a[ 8]) * b[ 0]; - int64_t t9 = ((int64_t)a[ 0]) * b[ 9] - + ((int64_t)a[ 1]) * b[ 8] - + ((int64_t)a[ 2]) * b[ 7] - + ((int64_t)a[ 3]) * b[ 6] - + ((int64_t)a[ 4]) * b[ 5] - + ((int64_t)a[ 5]) * b[ 4] - + ((int64_t)a[ 6]) * b[ 3] - + ((int64_t)a[ 7]) * b[ 2] - + ((int64_t)a[ 8]) * b[ 1] - + ((int64_t)a[ 9]) * b[ 0]; - int64_t t10 = ((int64_t)a[ 0]) * b[10] - + ((int64_t)a[ 1]) * b[ 9] - + ((int64_t)a[ 2]) * b[ 8] - + ((int64_t)a[ 3]) * b[ 7] - + ((int64_t)a[ 4]) * b[ 6] - + ((int64_t)a[ 5]) * b[ 5] - + ((int64_t)a[ 6]) * b[ 4] - + ((int64_t)a[ 7]) * b[ 3] - + ((int64_t)a[ 8]) * b[ 2] - + ((int64_t)a[ 9]) * b[ 1] - + ((int64_t)a[10]) * b[ 0]; - int64_t t11 = ((int64_t)a[ 0]) * b[11] - + ((int64_t)a[ 1]) * b[10] - + ((int64_t)a[ 2]) * b[ 9] - + ((int64_t)a[ 3]) * b[ 8] - + ((int64_t)a[ 4]) * b[ 7] - + ((int64_t)a[ 5]) * b[ 6] - + ((int64_t)a[ 6]) * b[ 5] - + ((int64_t)a[ 7]) * b[ 4] - + ((int64_t)a[ 8]) * b[ 3] - + ((int64_t)a[ 9]) * b[ 2] - + ((int64_t)a[10]) * b[ 1] - + ((int64_t)a[11]) * b[ 0]; - int64_t t12 = ((int64_t)a[ 0]) * b[12] - + ((int64_t)a[ 1]) * b[11] - + ((int64_t)a[ 2]) * b[10] - + ((int64_t)a[ 3]) * b[ 9] - + ((int64_t)a[ 4]) * b[ 8] - + ((int64_t)a[ 5]) * b[ 7] - + ((int64_t)a[ 6]) * b[ 6] - + ((int64_t)a[ 7]) * b[ 5] - + ((int64_t)a[ 8]) * b[ 4] - + ((int64_t)a[ 9]) * b[ 3] - + ((int64_t)a[10]) * b[ 2] - + ((int64_t)a[11]) * b[ 1] - + ((int64_t)a[12]) * b[ 0]; - int64_t t13 = ((int64_t)a[ 0]) * b[13] - + ((int64_t)a[ 1]) * b[12] - + ((int64_t)a[ 2]) * b[11] - + ((int64_t)a[ 3]) * b[10] - + ((int64_t)a[ 4]) * b[ 9] - + ((int64_t)a[ 5]) * b[ 8] - + ((int64_t)a[ 6]) * b[ 7] - + ((int64_t)a[ 7]) * b[ 6] - + ((int64_t)a[ 8]) * b[ 5] - + ((int64_t)a[ 9]) * b[ 4] - + ((int64_t)a[10]) * b[ 3] - + ((int64_t)a[11]) * b[ 2] - + ((int64_t)a[12]) * b[ 1] - + ((int64_t)a[13]) * b[ 0]; - int64_t t14 = ((int64_t)a[ 0]) * b[14] - + ((int64_t)a[ 1]) * b[13] - + ((int64_t)a[ 2]) * b[12] - + ((int64_t)a[ 3]) * b[11] - + ((int64_t)a[ 4]) * b[10] - + 
((int64_t)a[ 5]) * b[ 9] - + ((int64_t)a[ 6]) * b[ 8] - + ((int64_t)a[ 7]) * b[ 7] - + ((int64_t)a[ 8]) * b[ 6] - + ((int64_t)a[ 9]) * b[ 5] - + ((int64_t)a[10]) * b[ 4] - + ((int64_t)a[11]) * b[ 3] - + ((int64_t)a[12]) * b[ 2] - + ((int64_t)a[13]) * b[ 1] - + ((int64_t)a[14]) * b[ 0]; - int64_t t15 = ((int64_t)a[ 1]) * b[14] - + ((int64_t)a[ 2]) * b[13] - + ((int64_t)a[ 3]) * b[12] - + ((int64_t)a[ 4]) * b[11] - + ((int64_t)a[ 5]) * b[10] - + ((int64_t)a[ 6]) * b[ 9] - + ((int64_t)a[ 7]) * b[ 8] - + ((int64_t)a[ 8]) * b[ 7] - + ((int64_t)a[ 9]) * b[ 6] - + ((int64_t)a[10]) * b[ 5] - + ((int64_t)a[11]) * b[ 4] - + ((int64_t)a[12]) * b[ 3] - + ((int64_t)a[13]) * b[ 2] - + ((int64_t)a[14]) * b[ 1]; - int64_t t16 = ((int64_t)a[ 2]) * b[14] - + ((int64_t)a[ 3]) * b[13] - + ((int64_t)a[ 4]) * b[12] - + ((int64_t)a[ 5]) * b[11] - + ((int64_t)a[ 6]) * b[10] - + ((int64_t)a[ 7]) * b[ 9] - + ((int64_t)a[ 8]) * b[ 8] - + ((int64_t)a[ 9]) * b[ 7] - + ((int64_t)a[10]) * b[ 6] - + ((int64_t)a[11]) * b[ 5] - + ((int64_t)a[12]) * b[ 4] - + ((int64_t)a[13]) * b[ 3] - + ((int64_t)a[14]) * b[ 2]; - int64_t t17 = ((int64_t)a[ 3]) * b[14] - + ((int64_t)a[ 4]) * b[13] - + ((int64_t)a[ 5]) * b[12] - + ((int64_t)a[ 6]) * b[11] - + ((int64_t)a[ 7]) * b[10] - + ((int64_t)a[ 8]) * b[ 9] - + ((int64_t)a[ 9]) * b[ 8] - + ((int64_t)a[10]) * b[ 7] - + ((int64_t)a[11]) * b[ 6] - + ((int64_t)a[12]) * b[ 5] - + ((int64_t)a[13]) * b[ 4] - + ((int64_t)a[14]) * b[ 3]; - int64_t t18 = ((int64_t)a[ 4]) * b[14] - + ((int64_t)a[ 5]) * b[13] - + ((int64_t)a[ 6]) * b[12] - + ((int64_t)a[ 7]) * b[11] - + ((int64_t)a[ 8]) * b[10] - + ((int64_t)a[ 9]) * b[ 9] - + ((int64_t)a[10]) * b[ 8] - + ((int64_t)a[11]) * b[ 7] - + ((int64_t)a[12]) * b[ 6] - + ((int64_t)a[13]) * b[ 5] - + ((int64_t)a[14]) * b[ 4]; - int64_t t19 = ((int64_t)a[ 5]) * b[14] - + ((int64_t)a[ 6]) * b[13] - + ((int64_t)a[ 7]) * b[12] - + ((int64_t)a[ 8]) * b[11] - + ((int64_t)a[ 9]) * b[10] - + ((int64_t)a[10]) * b[ 9] - + ((int64_t)a[11]) * b[ 8] - + ((int64_t)a[12]) * b[ 7] - + ((int64_t)a[13]) * b[ 6] - + ((int64_t)a[14]) * b[ 5]; - int64_t t20 = ((int64_t)a[ 6]) * b[14] - + ((int64_t)a[ 7]) * b[13] - + ((int64_t)a[ 8]) * b[12] - + ((int64_t)a[ 9]) * b[11] - + ((int64_t)a[10]) * b[10] - + ((int64_t)a[11]) * b[ 9] - + ((int64_t)a[12]) * b[ 8] - + ((int64_t)a[13]) * b[ 7] - + ((int64_t)a[14]) * b[ 6]; - int64_t t21 = ((int64_t)a[ 7]) * b[14] - + ((int64_t)a[ 8]) * b[13] - + ((int64_t)a[ 9]) * b[12] - + ((int64_t)a[10]) * b[11] - + ((int64_t)a[11]) * b[10] - + ((int64_t)a[12]) * b[ 9] - + ((int64_t)a[13]) * b[ 8] - + ((int64_t)a[14]) * b[ 7]; - int64_t t22 = ((int64_t)a[ 8]) * b[14] - + ((int64_t)a[ 9]) * b[13] - + ((int64_t)a[10]) * b[12] - + ((int64_t)a[11]) * b[11] - + ((int64_t)a[12]) * b[10] - + ((int64_t)a[13]) * b[ 9] - + ((int64_t)a[14]) * b[ 8]; - int64_t t23 = ((int64_t)a[ 9]) * b[14] - + ((int64_t)a[10]) * b[13] - + ((int64_t)a[11]) * b[12] - + ((int64_t)a[12]) * b[11] - + ((int64_t)a[13]) * b[10] - + ((int64_t)a[14]) * b[ 9]; - int64_t t24 = ((int64_t)a[10]) * b[14] - + ((int64_t)a[11]) * b[13] - + ((int64_t)a[12]) * b[12] - + ((int64_t)a[13]) * b[11] - + ((int64_t)a[14]) * b[10]; - int64_t t25 = ((int64_t)a[11]) * b[14] - + ((int64_t)a[12]) * b[13] - + ((int64_t)a[13]) * b[12] - + ((int64_t)a[14]) * b[11]; - int64_t t26 = ((int64_t)a[12]) * b[14] - + ((int64_t)a[13]) * b[13] - + ((int64_t)a[14]) * b[12]; - int64_t t27 = ((int64_t)a[13]) * b[14] - + ((int64_t)a[14]) * b[13]; - int64_t t28 = ((int64_t)a[14]) * b[14]; + sp_uint64 t0 = ((sp_uint64)a[ 0]) * b[ 
0]; + sp_uint64 t1 = ((sp_uint64)a[ 0]) * b[ 1] + + ((sp_uint64)a[ 1]) * b[ 0]; + sp_uint64 t2 = ((sp_uint64)a[ 0]) * b[ 2] + + ((sp_uint64)a[ 1]) * b[ 1] + + ((sp_uint64)a[ 2]) * b[ 0]; + sp_uint64 t3 = ((sp_uint64)a[ 0]) * b[ 3] + + ((sp_uint64)a[ 1]) * b[ 2] + + ((sp_uint64)a[ 2]) * b[ 1] + + ((sp_uint64)a[ 3]) * b[ 0]; + sp_uint64 t4 = ((sp_uint64)a[ 0]) * b[ 4] + + ((sp_uint64)a[ 1]) * b[ 3] + + ((sp_uint64)a[ 2]) * b[ 2] + + ((sp_uint64)a[ 3]) * b[ 1] + + ((sp_uint64)a[ 4]) * b[ 0]; + sp_uint64 t5 = ((sp_uint64)a[ 0]) * b[ 5] + + ((sp_uint64)a[ 1]) * b[ 4] + + ((sp_uint64)a[ 2]) * b[ 3] + + ((sp_uint64)a[ 3]) * b[ 2] + + ((sp_uint64)a[ 4]) * b[ 1] + + ((sp_uint64)a[ 5]) * b[ 0]; + sp_uint64 t6 = ((sp_uint64)a[ 0]) * b[ 6] + + ((sp_uint64)a[ 1]) * b[ 5] + + ((sp_uint64)a[ 2]) * b[ 4] + + ((sp_uint64)a[ 3]) * b[ 3] + + ((sp_uint64)a[ 4]) * b[ 2] + + ((sp_uint64)a[ 5]) * b[ 1] + + ((sp_uint64)a[ 6]) * b[ 0]; + sp_uint64 t7 = ((sp_uint64)a[ 0]) * b[ 7] + + ((sp_uint64)a[ 1]) * b[ 6] + + ((sp_uint64)a[ 2]) * b[ 5] + + ((sp_uint64)a[ 3]) * b[ 4] + + ((sp_uint64)a[ 4]) * b[ 3] + + ((sp_uint64)a[ 5]) * b[ 2] + + ((sp_uint64)a[ 6]) * b[ 1] + + ((sp_uint64)a[ 7]) * b[ 0]; + sp_uint64 t8 = ((sp_uint64)a[ 0]) * b[ 8] + + ((sp_uint64)a[ 1]) * b[ 7] + + ((sp_uint64)a[ 2]) * b[ 6] + + ((sp_uint64)a[ 3]) * b[ 5] + + ((sp_uint64)a[ 4]) * b[ 4] + + ((sp_uint64)a[ 5]) * b[ 3] + + ((sp_uint64)a[ 6]) * b[ 2] + + ((sp_uint64)a[ 7]) * b[ 1] + + ((sp_uint64)a[ 8]) * b[ 0]; + sp_uint64 t9 = ((sp_uint64)a[ 0]) * b[ 9] + + ((sp_uint64)a[ 1]) * b[ 8] + + ((sp_uint64)a[ 2]) * b[ 7] + + ((sp_uint64)a[ 3]) * b[ 6] + + ((sp_uint64)a[ 4]) * b[ 5] + + ((sp_uint64)a[ 5]) * b[ 4] + + ((sp_uint64)a[ 6]) * b[ 3] + + ((sp_uint64)a[ 7]) * b[ 2] + + ((sp_uint64)a[ 8]) * b[ 1] + + ((sp_uint64)a[ 9]) * b[ 0]; + sp_uint64 t10 = ((sp_uint64)a[ 0]) * b[10] + + ((sp_uint64)a[ 1]) * b[ 9] + + ((sp_uint64)a[ 2]) * b[ 8] + + ((sp_uint64)a[ 3]) * b[ 7] + + ((sp_uint64)a[ 4]) * b[ 6] + + ((sp_uint64)a[ 5]) * b[ 5] + + ((sp_uint64)a[ 6]) * b[ 4] + + ((sp_uint64)a[ 7]) * b[ 3] + + ((sp_uint64)a[ 8]) * b[ 2] + + ((sp_uint64)a[ 9]) * b[ 1] + + ((sp_uint64)a[10]) * b[ 0]; + sp_uint64 t11 = ((sp_uint64)a[ 0]) * b[11] + + ((sp_uint64)a[ 1]) * b[10] + + ((sp_uint64)a[ 2]) * b[ 9] + + ((sp_uint64)a[ 3]) * b[ 8] + + ((sp_uint64)a[ 4]) * b[ 7] + + ((sp_uint64)a[ 5]) * b[ 6] + + ((sp_uint64)a[ 6]) * b[ 5] + + ((sp_uint64)a[ 7]) * b[ 4] + + ((sp_uint64)a[ 8]) * b[ 3] + + ((sp_uint64)a[ 9]) * b[ 2] + + ((sp_uint64)a[10]) * b[ 1] + + ((sp_uint64)a[11]) * b[ 0]; + sp_uint64 t12 = ((sp_uint64)a[ 1]) * b[11] + + ((sp_uint64)a[ 2]) * b[10] + + ((sp_uint64)a[ 3]) * b[ 9] + + ((sp_uint64)a[ 4]) * b[ 8] + + ((sp_uint64)a[ 5]) * b[ 7] + + ((sp_uint64)a[ 6]) * b[ 6] + + ((sp_uint64)a[ 7]) * b[ 5] + + ((sp_uint64)a[ 8]) * b[ 4] + + ((sp_uint64)a[ 9]) * b[ 3] + + ((sp_uint64)a[10]) * b[ 2] + + ((sp_uint64)a[11]) * b[ 1]; + sp_uint64 t13 = ((sp_uint64)a[ 2]) * b[11] + + ((sp_uint64)a[ 3]) * b[10] + + ((sp_uint64)a[ 4]) * b[ 9] + + ((sp_uint64)a[ 5]) * b[ 8] + + ((sp_uint64)a[ 6]) * b[ 7] + + ((sp_uint64)a[ 7]) * b[ 6] + + ((sp_uint64)a[ 8]) * b[ 5] + + ((sp_uint64)a[ 9]) * b[ 4] + + ((sp_uint64)a[10]) * b[ 3] + + ((sp_uint64)a[11]) * b[ 2]; + sp_uint64 t14 = ((sp_uint64)a[ 3]) * b[11] + + ((sp_uint64)a[ 4]) * b[10] + + ((sp_uint64)a[ 5]) * b[ 9] + + ((sp_uint64)a[ 6]) * b[ 8] + + ((sp_uint64)a[ 7]) * b[ 7] + + ((sp_uint64)a[ 8]) * b[ 6] + + ((sp_uint64)a[ 9]) * b[ 5] + + ((sp_uint64)a[10]) * b[ 4] + + ((sp_uint64)a[11]) * b[ 3]; + sp_uint64 t15 = ((sp_uint64)a[ 
4]) * b[11] + + ((sp_uint64)a[ 5]) * b[10] + + ((sp_uint64)a[ 6]) * b[ 9] + + ((sp_uint64)a[ 7]) * b[ 8] + + ((sp_uint64)a[ 8]) * b[ 7] + + ((sp_uint64)a[ 9]) * b[ 6] + + ((sp_uint64)a[10]) * b[ 5] + + ((sp_uint64)a[11]) * b[ 4]; + sp_uint64 t16 = ((sp_uint64)a[ 5]) * b[11] + + ((sp_uint64)a[ 6]) * b[10] + + ((sp_uint64)a[ 7]) * b[ 9] + + ((sp_uint64)a[ 8]) * b[ 8] + + ((sp_uint64)a[ 9]) * b[ 7] + + ((sp_uint64)a[10]) * b[ 6] + + ((sp_uint64)a[11]) * b[ 5]; + sp_uint64 t17 = ((sp_uint64)a[ 6]) * b[11] + + ((sp_uint64)a[ 7]) * b[10] + + ((sp_uint64)a[ 8]) * b[ 9] + + ((sp_uint64)a[ 9]) * b[ 8] + + ((sp_uint64)a[10]) * b[ 7] + + ((sp_uint64)a[11]) * b[ 6]; + sp_uint64 t18 = ((sp_uint64)a[ 7]) * b[11] + + ((sp_uint64)a[ 8]) * b[10] + + ((sp_uint64)a[ 9]) * b[ 9] + + ((sp_uint64)a[10]) * b[ 8] + + ((sp_uint64)a[11]) * b[ 7]; + sp_uint64 t19 = ((sp_uint64)a[ 8]) * b[11] + + ((sp_uint64)a[ 9]) * b[10] + + ((sp_uint64)a[10]) * b[ 9] + + ((sp_uint64)a[11]) * b[ 8]; + sp_uint64 t20 = ((sp_uint64)a[ 9]) * b[11] + + ((sp_uint64)a[10]) * b[10] + + ((sp_uint64)a[11]) * b[ 9]; + sp_uint64 t21 = ((sp_uint64)a[10]) * b[11] + + ((sp_uint64)a[11]) * b[10]; + sp_uint64 t22 = ((sp_uint64)a[11]) * b[11]; - t1 += t0 >> 23; r[ 0] = t0 & 0x7fffff; - t2 += t1 >> 23; r[ 1] = t1 & 0x7fffff; - t3 += t2 >> 23; r[ 2] = t2 & 0x7fffff; - t4 += t3 >> 23; r[ 3] = t3 & 0x7fffff; - t5 += t4 >> 23; r[ 4] = t4 & 0x7fffff; - t6 += t5 >> 23; r[ 5] = t5 & 0x7fffff; - t7 += t6 >> 23; r[ 6] = t6 & 0x7fffff; - t8 += t7 >> 23; r[ 7] = t7 & 0x7fffff; - t9 += t8 >> 23; r[ 8] = t8 & 0x7fffff; - t10 += t9 >> 23; r[ 9] = t9 & 0x7fffff; - t11 += t10 >> 23; r[10] = t10 & 0x7fffff; - t12 += t11 >> 23; r[11] = t11 & 0x7fffff; - t13 += t12 >> 23; r[12] = t12 & 0x7fffff; - t14 += t13 >> 23; r[13] = t13 & 0x7fffff; - t15 += t14 >> 23; r[14] = t14 & 0x7fffff; - t16 += t15 >> 23; r[15] = t15 & 0x7fffff; - t17 += t16 >> 23; r[16] = t16 & 0x7fffff; - t18 += t17 >> 23; r[17] = t17 & 0x7fffff; - t19 += t18 >> 23; r[18] = t18 & 0x7fffff; - t20 += t19 >> 23; r[19] = t19 & 0x7fffff; - t21 += t20 >> 23; r[20] = t20 & 0x7fffff; - t22 += t21 >> 23; r[21] = t21 & 0x7fffff; - t23 += t22 >> 23; r[22] = t22 & 0x7fffff; - t24 += t23 >> 23; r[23] = t23 & 0x7fffff; - t25 += t24 >> 23; r[24] = t24 & 0x7fffff; - t26 += t25 >> 23; r[25] = t25 & 0x7fffff; - t27 += t26 >> 23; r[26] = t26 & 0x7fffff; - t28 += t27 >> 23; r[27] = t27 & 0x7fffff; - r[29] = (sp_digit)(t28 >> 23); - r[28] = t28 & 0x7fffff; + t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; + t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; + t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; + t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; + t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; + t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; + t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; + t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; + t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; + t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; + t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; + t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; + t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; + t14 += t13 >> 29; r[13] = t13 & 0x1fffffff; + t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; + t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; + t17 += t16 >> 29; r[16] = t16 & 0x1fffffff; + t18 += t17 >> 29; r[17] = t17 & 0x1fffffff; + t19 += t18 >> 29; r[18] = t18 & 0x1fffffff; + t20 += t19 >> 29; r[19] = t19 & 0x1fffffff; + t21 += t20 >> 29; r[20] = t20 & 0x1fffffff; + t22 += t21 >> 29; r[21] = t21 & 0x1fffffff; + r[23] = (sp_digit)(t22 >> 29); + r[22] = t22 & 0x1fffffff; } /* Square a and put result in r. 
(r = a * a) @@ -503,159 +498,111 @@ SP_NOINLINE static void sp_2048_mul_15(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_15(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_12(sp_digit* r, const sp_digit* a) { - int64_t t0 = ((int64_t)a[ 0]) * a[ 0]; - int64_t t1 = (((int64_t)a[ 0]) * a[ 1]) * 2; - int64_t t2 = (((int64_t)a[ 0]) * a[ 2]) * 2 - + ((int64_t)a[ 1]) * a[ 1]; - int64_t t3 = (((int64_t)a[ 0]) * a[ 3] - + ((int64_t)a[ 1]) * a[ 2]) * 2; - int64_t t4 = (((int64_t)a[ 0]) * a[ 4] - + ((int64_t)a[ 1]) * a[ 3]) * 2 - + ((int64_t)a[ 2]) * a[ 2]; - int64_t t5 = (((int64_t)a[ 0]) * a[ 5] - + ((int64_t)a[ 1]) * a[ 4] - + ((int64_t)a[ 2]) * a[ 3]) * 2; - int64_t t6 = (((int64_t)a[ 0]) * a[ 6] - + ((int64_t)a[ 1]) * a[ 5] - + ((int64_t)a[ 2]) * a[ 4]) * 2 - + ((int64_t)a[ 3]) * a[ 3]; - int64_t t7 = (((int64_t)a[ 0]) * a[ 7] - + ((int64_t)a[ 1]) * a[ 6] - + ((int64_t)a[ 2]) * a[ 5] - + ((int64_t)a[ 3]) * a[ 4]) * 2; - int64_t t8 = (((int64_t)a[ 0]) * a[ 8] - + ((int64_t)a[ 1]) * a[ 7] - + ((int64_t)a[ 2]) * a[ 6] - + ((int64_t)a[ 3]) * a[ 5]) * 2 - + ((int64_t)a[ 4]) * a[ 4]; - int64_t t9 = (((int64_t)a[ 0]) * a[ 9] - + ((int64_t)a[ 1]) * a[ 8] - + ((int64_t)a[ 2]) * a[ 7] - + ((int64_t)a[ 3]) * a[ 6] - + ((int64_t)a[ 4]) * a[ 5]) * 2; - int64_t t10 = (((int64_t)a[ 0]) * a[10] - + ((int64_t)a[ 1]) * a[ 9] - + ((int64_t)a[ 2]) * a[ 8] - + ((int64_t)a[ 3]) * a[ 7] - + ((int64_t)a[ 4]) * a[ 6]) * 2 - + ((int64_t)a[ 5]) * a[ 5]; - int64_t t11 = (((int64_t)a[ 0]) * a[11] - + ((int64_t)a[ 1]) * a[10] - + ((int64_t)a[ 2]) * a[ 9] - + ((int64_t)a[ 3]) * a[ 8] - + ((int64_t)a[ 4]) * a[ 7] - + ((int64_t)a[ 5]) * a[ 6]) * 2; - int64_t t12 = (((int64_t)a[ 0]) * a[12] - + ((int64_t)a[ 1]) * a[11] - + ((int64_t)a[ 2]) * a[10] - + ((int64_t)a[ 3]) * a[ 9] - + ((int64_t)a[ 4]) * a[ 8] - + ((int64_t)a[ 5]) * a[ 7]) * 2 - + ((int64_t)a[ 6]) * a[ 6]; - int64_t t13 = (((int64_t)a[ 0]) * a[13] - + ((int64_t)a[ 1]) * a[12] - + ((int64_t)a[ 2]) * a[11] - + ((int64_t)a[ 3]) * a[10] - + ((int64_t)a[ 4]) * a[ 9] - + ((int64_t)a[ 5]) * a[ 8] - + ((int64_t)a[ 6]) * a[ 7]) * 2; - int64_t t14 = (((int64_t)a[ 0]) * a[14] - + ((int64_t)a[ 1]) * a[13] - + ((int64_t)a[ 2]) * a[12] - + ((int64_t)a[ 3]) * a[11] - + ((int64_t)a[ 4]) * a[10] - + ((int64_t)a[ 5]) * a[ 9] - + ((int64_t)a[ 6]) * a[ 8]) * 2 - + ((int64_t)a[ 7]) * a[ 7]; - int64_t t15 = (((int64_t)a[ 1]) * a[14] - + ((int64_t)a[ 2]) * a[13] - + ((int64_t)a[ 3]) * a[12] - + ((int64_t)a[ 4]) * a[11] - + ((int64_t)a[ 5]) * a[10] - + ((int64_t)a[ 6]) * a[ 9] - + ((int64_t)a[ 7]) * a[ 8]) * 2; - int64_t t16 = (((int64_t)a[ 2]) * a[14] - + ((int64_t)a[ 3]) * a[13] - + ((int64_t)a[ 4]) * a[12] - + ((int64_t)a[ 5]) * a[11] - + ((int64_t)a[ 6]) * a[10] - + ((int64_t)a[ 7]) * a[ 9]) * 2 - + ((int64_t)a[ 8]) * a[ 8]; - int64_t t17 = (((int64_t)a[ 3]) * a[14] - + ((int64_t)a[ 4]) * a[13] - + ((int64_t)a[ 5]) * a[12] - + ((int64_t)a[ 6]) * a[11] - + ((int64_t)a[ 7]) * a[10] - + ((int64_t)a[ 8]) * a[ 9]) * 2; - int64_t t18 = (((int64_t)a[ 4]) * a[14] - + ((int64_t)a[ 5]) * a[13] - + ((int64_t)a[ 6]) * a[12] - + ((int64_t)a[ 7]) * a[11] - + ((int64_t)a[ 8]) * a[10]) * 2 - + ((int64_t)a[ 9]) * a[ 9]; - int64_t t19 = (((int64_t)a[ 5]) * a[14] - + ((int64_t)a[ 6]) * a[13] - + ((int64_t)a[ 7]) * a[12] - + ((int64_t)a[ 8]) * a[11] - + ((int64_t)a[ 9]) * a[10]) * 2; - int64_t t20 = (((int64_t)a[ 6]) * a[14] - + ((int64_t)a[ 7]) * a[13] - + ((int64_t)a[ 8]) * a[12] - + 
((int64_t)a[ 9]) * a[11]) * 2 - + ((int64_t)a[10]) * a[10]; - int64_t t21 = (((int64_t)a[ 7]) * a[14] - + ((int64_t)a[ 8]) * a[13] - + ((int64_t)a[ 9]) * a[12] - + ((int64_t)a[10]) * a[11]) * 2; - int64_t t22 = (((int64_t)a[ 8]) * a[14] - + ((int64_t)a[ 9]) * a[13] - + ((int64_t)a[10]) * a[12]) * 2 - + ((int64_t)a[11]) * a[11]; - int64_t t23 = (((int64_t)a[ 9]) * a[14] - + ((int64_t)a[10]) * a[13] - + ((int64_t)a[11]) * a[12]) * 2; - int64_t t24 = (((int64_t)a[10]) * a[14] - + ((int64_t)a[11]) * a[13]) * 2 - + ((int64_t)a[12]) * a[12]; - int64_t t25 = (((int64_t)a[11]) * a[14] - + ((int64_t)a[12]) * a[13]) * 2; - int64_t t26 = (((int64_t)a[12]) * a[14]) * 2 - + ((int64_t)a[13]) * a[13]; - int64_t t27 = (((int64_t)a[13]) * a[14]) * 2; - int64_t t28 = ((int64_t)a[14]) * a[14]; + sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; + sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; + sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 + + ((sp_uint64)a[ 1]) * a[ 1]; + sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] + + ((sp_uint64)a[ 1]) * a[ 2]) * 2; + sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] + + ((sp_uint64)a[ 1]) * a[ 3]) * 2 + + ((sp_uint64)a[ 2]) * a[ 2]; + sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] + + ((sp_uint64)a[ 1]) * a[ 4] + + ((sp_uint64)a[ 2]) * a[ 3]) * 2; + sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] + + ((sp_uint64)a[ 1]) * a[ 5] + + ((sp_uint64)a[ 2]) * a[ 4]) * 2 + + ((sp_uint64)a[ 3]) * a[ 3]; + sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] + + ((sp_uint64)a[ 1]) * a[ 6] + + ((sp_uint64)a[ 2]) * a[ 5] + + ((sp_uint64)a[ 3]) * a[ 4]) * 2; + sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] + + ((sp_uint64)a[ 1]) * a[ 7] + + ((sp_uint64)a[ 2]) * a[ 6] + + ((sp_uint64)a[ 3]) * a[ 5]) * 2 + + ((sp_uint64)a[ 4]) * a[ 4]; + sp_uint64 t9 = (((sp_uint64)a[ 0]) * a[ 9] + + ((sp_uint64)a[ 1]) * a[ 8] + + ((sp_uint64)a[ 2]) * a[ 7] + + ((sp_uint64)a[ 3]) * a[ 6] + + ((sp_uint64)a[ 4]) * a[ 5]) * 2; + sp_uint64 t10 = (((sp_uint64)a[ 0]) * a[10] + + ((sp_uint64)a[ 1]) * a[ 9] + + ((sp_uint64)a[ 2]) * a[ 8] + + ((sp_uint64)a[ 3]) * a[ 7] + + ((sp_uint64)a[ 4]) * a[ 6]) * 2 + + ((sp_uint64)a[ 5]) * a[ 5]; + sp_uint64 t11 = (((sp_uint64)a[ 0]) * a[11] + + ((sp_uint64)a[ 1]) * a[10] + + ((sp_uint64)a[ 2]) * a[ 9] + + ((sp_uint64)a[ 3]) * a[ 8] + + ((sp_uint64)a[ 4]) * a[ 7] + + ((sp_uint64)a[ 5]) * a[ 6]) * 2; + sp_uint64 t12 = (((sp_uint64)a[ 1]) * a[11] + + ((sp_uint64)a[ 2]) * a[10] + + ((sp_uint64)a[ 3]) * a[ 9] + + ((sp_uint64)a[ 4]) * a[ 8] + + ((sp_uint64)a[ 5]) * a[ 7]) * 2 + + ((sp_uint64)a[ 6]) * a[ 6]; + sp_uint64 t13 = (((sp_uint64)a[ 2]) * a[11] + + ((sp_uint64)a[ 3]) * a[10] + + ((sp_uint64)a[ 4]) * a[ 9] + + ((sp_uint64)a[ 5]) * a[ 8] + + ((sp_uint64)a[ 6]) * a[ 7]) * 2; + sp_uint64 t14 = (((sp_uint64)a[ 3]) * a[11] + + ((sp_uint64)a[ 4]) * a[10] + + ((sp_uint64)a[ 5]) * a[ 9] + + ((sp_uint64)a[ 6]) * a[ 8]) * 2 + + ((sp_uint64)a[ 7]) * a[ 7]; + sp_uint64 t15 = (((sp_uint64)a[ 4]) * a[11] + + ((sp_uint64)a[ 5]) * a[10] + + ((sp_uint64)a[ 6]) * a[ 9] + + ((sp_uint64)a[ 7]) * a[ 8]) * 2; + sp_uint64 t16 = (((sp_uint64)a[ 5]) * a[11] + + ((sp_uint64)a[ 6]) * a[10] + + ((sp_uint64)a[ 7]) * a[ 9]) * 2 + + ((sp_uint64)a[ 8]) * a[ 8]; + sp_uint64 t17 = (((sp_uint64)a[ 6]) * a[11] + + ((sp_uint64)a[ 7]) * a[10] + + ((sp_uint64)a[ 8]) * a[ 9]) * 2; + sp_uint64 t18 = (((sp_uint64)a[ 7]) * a[11] + + ((sp_uint64)a[ 8]) * a[10]) * 2 + + ((sp_uint64)a[ 9]) * a[ 9]; + sp_uint64 t19 = (((sp_uint64)a[ 8]) * a[11] + + ((sp_uint64)a[ 9]) * a[10]) * 2; + sp_uint64 t20 = (((sp_uint64)a[ 9]) * a[11]) * 2 + + 
((sp_uint64)a[10]) * a[10]; + sp_uint64 t21 = (((sp_uint64)a[10]) * a[11]) * 2; + sp_uint64 t22 = ((sp_uint64)a[11]) * a[11]; - t1 += t0 >> 23; r[ 0] = t0 & 0x7fffff; - t2 += t1 >> 23; r[ 1] = t1 & 0x7fffff; - t3 += t2 >> 23; r[ 2] = t2 & 0x7fffff; - t4 += t3 >> 23; r[ 3] = t3 & 0x7fffff; - t5 += t4 >> 23; r[ 4] = t4 & 0x7fffff; - t6 += t5 >> 23; r[ 5] = t5 & 0x7fffff; - t7 += t6 >> 23; r[ 6] = t6 & 0x7fffff; - t8 += t7 >> 23; r[ 7] = t7 & 0x7fffff; - t9 += t8 >> 23; r[ 8] = t8 & 0x7fffff; - t10 += t9 >> 23; r[ 9] = t9 & 0x7fffff; - t11 += t10 >> 23; r[10] = t10 & 0x7fffff; - t12 += t11 >> 23; r[11] = t11 & 0x7fffff; - t13 += t12 >> 23; r[12] = t12 & 0x7fffff; - t14 += t13 >> 23; r[13] = t13 & 0x7fffff; - t15 += t14 >> 23; r[14] = t14 & 0x7fffff; - t16 += t15 >> 23; r[15] = t15 & 0x7fffff; - t17 += t16 >> 23; r[16] = t16 & 0x7fffff; - t18 += t17 >> 23; r[17] = t17 & 0x7fffff; - t19 += t18 >> 23; r[18] = t18 & 0x7fffff; - t20 += t19 >> 23; r[19] = t19 & 0x7fffff; - t21 += t20 >> 23; r[20] = t20 & 0x7fffff; - t22 += t21 >> 23; r[21] = t21 & 0x7fffff; - t23 += t22 >> 23; r[22] = t22 & 0x7fffff; - t24 += t23 >> 23; r[23] = t23 & 0x7fffff; - t25 += t24 >> 23; r[24] = t24 & 0x7fffff; - t26 += t25 >> 23; r[25] = t25 & 0x7fffff; - t27 += t26 >> 23; r[26] = t26 & 0x7fffff; - t28 += t27 >> 23; r[27] = t27 & 0x7fffff; - r[29] = (sp_digit)(t28 >> 23); - r[28] = t28 & 0x7fffff; + t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; + t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; + t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; + t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; + t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; + t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; + t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; + t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; + t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; + t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; + t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; + t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; + t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; + t14 += t13 >> 29; r[13] = t13 & 0x1fffffff; + t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; + t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; + t17 += t16 >> 29; r[16] = t16 & 0x1fffffff; + t18 += t17 >> 29; r[17] = t17 & 0x1fffffff; + t19 += t18 >> 29; r[18] = t18 & 0x1fffffff; + t20 += t19 >> 29; r[19] = t19 & 0x1fffffff; + t21 += t20 >> 29; r[20] = t20 & 0x1fffffff; + t22 += t21 >> 29; r[21] = t21 & 0x1fffffff; + r[23] = (sp_digit)(t22 >> 29); + r[22] = t22 & 0x1fffffff; } /* Add b to a into r. (r = a + b) @@ -664,7 +611,7 @@ SP_NOINLINE static void sp_2048_sqr_15(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_add_15(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) { r[ 0] = a[ 0] + b[ 0]; @@ -679,9 +626,6 @@ SP_NOINLINE static int sp_2048_add_15(sp_digit* r, const sp_digit* a, r[ 9] = a[ 9] + b[ 9]; r[10] = a[10] + b[10]; r[11] = a[11] + b[11]; - r[12] = a[12] + b[12]; - r[13] = a[13] + b[13]; - r[14] = a[14] + b[14]; return 0; } @@ -692,7 +636,7 @@ SP_NOINLINE static int sp_2048_add_15(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
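The add and subtract helpers here operate limb by limb and deliberately leave carries unpropagated; the new sp_2048_norm_* routines restore canonical 29-bit limbs afterwards, which works because a 29-bit value in a 32-bit sp_digit leaves three bits of headroom. A minimal standalone sketch of that add-then-normalize pattern, using three limbs and hypothetical names:

/* Limb-wise add with deferred carry propagation. */
#include <stdio.h>
#include <stdint.h>

#define LIMB_BITS 29
#define LIMB_MASK 0x1fffffff

typedef int32_t limb;

static void add_3(limb* r, const limb* a, const limb* b)
{
    r[0] = a[0] + b[0];   /* may exceed 29 bits for now; that is fine */
    r[1] = a[1] + b[1];
    r[2] = a[2] + b[2];
}

static void norm_3(limb* a)
{
    a[1] += a[0] >> LIMB_BITS; a[0] &= LIMB_MASK;
    a[2] += a[1] >> LIMB_BITS; a[1] &= LIMB_MASK;
}

int main(void)
{
    limb a[3] = { LIMB_MASK, LIMB_MASK, 1 };  /* limbs close to the radix */
    limb r[3];

    add_3(r, a, a);   /* limbs now hold 30-bit values */
    norm_3(r);        /* push carries up, restore 29-bit limbs */
    printf("%x %x %x\n", (unsigned)r[0], (unsigned)r[1], (unsigned)r[2]);
    /* prints: 1ffffffe 1fffffff 3 */
    return 0;
}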
*/ -SP_NOINLINE static int sp_2048_sub_30(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; @@ -707,12 +651,6 @@ SP_NOINLINE static int sp_2048_sub_30(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] - b[i + 6]; r[i + 7] = a[i + 7] - b[i + 7]; } - r[24] = a[24] - b[24]; - r[25] = a[25] - b[25]; - r[26] = a[26] - b[26]; - r[27] = a[27] - b[27]; - r[28] = a[28] - b[28]; - r[29] = a[29] - b[29]; return 0; } @@ -723,7 +661,7 @@ SP_NOINLINE static int sp_2048_sub_30(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_add_30(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; @@ -738,64 +676,129 @@ SP_NOINLINE static int sp_2048_add_30(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + b[i + 6]; r[i + 7] = a[i + 7] + b[i + 7]; } - r[24] = a[24] + b[24]; - r[25] = a[25] + b[25]; - r[26] = a[26] + b[26]; - r[27] = a[27] + b[27]; - r[28] = a[28] + b[28]; - r[29] = a[29] + b[29]; return 0; } +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_12(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 11; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + a[1] += a[0] >> 29; a[0] &= 0x1fffffff; + a[2] += a[1] >> 29; a[1] &= 0x1fffffff; + a[3] += a[2] >> 29; a[2] &= 0x1fffffff; + a[4] += a[3] >> 29; a[3] &= 0x1fffffff; + a[5] += a[4] >> 29; a[4] &= 0x1fffffff; + a[6] += a[5] >> 29; a[5] &= 0x1fffffff; + a[7] += a[6] >> 29; a[6] &= 0x1fffffff; + a[8] += a[7] >> 29; a[7] &= 0x1fffffff; + a[9] += a[8] >> 29; a[8] &= 0x1fffffff; + a[10] += a[9] >> 29; a[9] &= 0x1fffffff; + a[11] += a[10] >> 29; a[10] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_24(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 23; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + int i; + for (i = 0; i < 16; i += 8) { + a[i+1] += a[i+0] >> 29; a[i+0] &= 0x1fffffff; + a[i+2] += a[i+1] >> 29; a[i+1] &= 0x1fffffff; + a[i+3] += a[i+2] >> 29; a[i+2] &= 0x1fffffff; + a[i+4] += a[i+3] >> 29; a[i+3] &= 0x1fffffff; + a[i+5] += a[i+4] >> 29; a[i+4] &= 0x1fffffff; + a[i+6] += a[i+5] >> 29; a[i+5] &= 0x1fffffff; + a[i+7] += a[i+6] >> 29; a[i+6] &= 0x1fffffff; + a[i+8] += a[i+7] >> 29; a[i+7] &= 0x1fffffff; + } + a[17] += a[16] >> 29; a[16] &= 0x1fffffff; + a[18] += a[17] >> 29; a[17] &= 0x1fffffff; + a[19] += a[18] >> 29; a[18] &= 0x1fffffff; + a[20] += a[19] >> 29; a[19] &= 0x1fffffff; + a[21] += a[20] >> 29; a[20] &= 0x1fffffff; + a[22] += a[21] >> 29; a[21] &= 0x1fffffff; + a[23] += a[22] >> 29; a[22] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
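The 36-word multiply that follows splits each operand into three 12-word thirds A0, A1, A2 and B0, B1, B2, forms six half-size products p0..p5, and recombines them; the extra sp_2048_norm_* calls keep the intermediate sums inside 29-bit limbs. The recombination identity can be checked numerically with small stand-in values, where X plays the role of the 12-word shift 2^(29*12). This is a standalone check only, not the generated code:

/* 3-way Karatsuba/Toom-style recombination:
 *   p0 = A0*B0, p2 = A1*B1, p4 = A2*B2,
 *   p1 = (A0+A1)(B0+B1), p3 = (A1+A2)(B1+B2), p5 = (A0+A1+A2)(B0+B1+B2)
 *   A*B = p0 + (p1-p2-p0)X + (p5-p3-p1+2p2)X^2 + (p3-p2-p4)X^3 + p4 X^4 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const int64_t X = 1000;                  /* stand-in for 2^(29*12) */
    const int64_t A0 = 123, A1 = 456, A2 = 789;
    const int64_t B0 = 321, B1 = 654, B2 = 987;
    const int64_t A = A0 + A1*X + A2*X*X;
    const int64_t B = B0 + B1*X + B2*X*X;

    const int64_t p0 = A0*B0, p2 = A1*B1, p4 = A2*B2;
    const int64_t p1 = (A0+A1)*(B0+B1);
    const int64_t p3 = (A1+A2)*(B1+B2);
    const int64_t p5 = (A0+A1+A2)*(B0+B1+B2);

    const int64_t r = p0
        + (p1 - p2 - p0)        * X
        + (p5 - p3 - p1 + 2*p2) * X*X
        + (p3 - p2 - p4)        * X*X*X
        + p4                    * X*X*X*X;

    printf("%d\n", r == A*B);                /* prints 1 */
    return 0;
}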
*/ -SP_NOINLINE static void sp_2048_mul_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit p0[30]; - sp_digit p1[30]; - sp_digit p2[30]; - sp_digit p3[30]; - sp_digit p4[30]; - sp_digit p5[30]; - sp_digit t0[30]; - sp_digit t1[30]; - sp_digit t2[30]; - sp_digit a0[15]; - sp_digit a1[15]; - sp_digit a2[15]; - sp_digit b0[15]; - sp_digit b1[15]; - sp_digit b2[15]; - (void)sp_2048_add_15(a0, a, &a[15]); - (void)sp_2048_add_15(b0, b, &b[15]); - (void)sp_2048_add_15(a1, &a[15], &a[30]); - (void)sp_2048_add_15(b1, &b[15], &b[30]); - (void)sp_2048_add_15(a2, a0, &a[30]); - (void)sp_2048_add_15(b2, b0, &b[30]); - sp_2048_mul_15(p0, a, b); - sp_2048_mul_15(p2, &a[15], &b[15]); - sp_2048_mul_15(p4, &a[30], &b[30]); - sp_2048_mul_15(p1, a0, b0); - sp_2048_mul_15(p3, a1, b1); - sp_2048_mul_15(p5, a2, b2); - XMEMSET(r, 0, sizeof(*r)*2U*45U); - (void)sp_2048_sub_30(t0, p3, p2); - (void)sp_2048_sub_30(t1, p1, p2); - (void)sp_2048_sub_30(t2, p5, t0); - (void)sp_2048_sub_30(t2, t2, t1); - (void)sp_2048_sub_30(t0, t0, p4); - (void)sp_2048_sub_30(t1, t1, p0); - (void)sp_2048_add_30(r, r, p0); - (void)sp_2048_add_30(&r[15], &r[15], t1); - (void)sp_2048_add_30(&r[30], &r[30], t2); - (void)sp_2048_add_30(&r[45], &r[45], t0); - (void)sp_2048_add_30(&r[60], &r[60], p4); + sp_digit p0[24]; + sp_digit p1[24]; + sp_digit p2[24]; + sp_digit p3[24]; + sp_digit p4[24]; + sp_digit p5[24]; + sp_digit t0[24]; + sp_digit t1[24]; + sp_digit t2[24]; + sp_digit a0[12]; + sp_digit a1[12]; + sp_digit a2[12]; + sp_digit b0[12]; + sp_digit b1[12]; + sp_digit b2[12]; + (void)sp_2048_add_12(a0, a, &a[12]); + sp_2048_norm_12(a0); + (void)sp_2048_add_12(b0, b, &b[12]); + sp_2048_norm_12(b0); + (void)sp_2048_add_12(a1, &a[12], &a[24]); + sp_2048_norm_12(a1); + (void)sp_2048_add_12(b1, &b[12], &b[24]); + sp_2048_norm_12(b1); + (void)sp_2048_add_12(a2, a0, &a[24]); + sp_2048_norm_12(a1); + (void)sp_2048_add_12(b2, b0, &b[24]); + sp_2048_norm_12(b2); + sp_2048_mul_12(p0, a, b); + sp_2048_mul_12(p2, &a[12], &b[12]); + sp_2048_mul_12(p4, &a[24], &b[24]); + sp_2048_mul_12(p1, a0, b0); + sp_2048_mul_12(p3, a1, b1); + sp_2048_mul_12(p5, a2, b2); + XMEMSET(r, 0, sizeof(*r)*2U*36U); + (void)sp_2048_sub_24(t0, p3, p2); + (void)sp_2048_sub_24(t1, p1, p2); + (void)sp_2048_sub_24(t2, p5, t0); + (void)sp_2048_sub_24(t2, t2, t1); + sp_2048_norm_24(t2); + (void)sp_2048_sub_24(t0, t0, p4); + sp_2048_norm_24(t0); + (void)sp_2048_sub_24(t1, t1, p0); + sp_2048_norm_24(t1); + (void)sp_2048_add_24(r, r, p0); + (void)sp_2048_add_24(&r[12], &r[12], t1); + (void)sp_2048_add_24(&r[24], &r[24], t2); + (void)sp_2048_add_24(&r[36], &r[36], t0); + (void)sp_2048_add_24(&r[48], &r[48], p4); + sp_2048_norm_72(r); } /* Square a into r. (r = a * a) @@ -803,41 +806,48 @@ SP_NOINLINE static void sp_2048_mul_45(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. 
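The squaring that follows halves the multiplication work by computing each off-diagonal product once and doubling it, which is where the "* 2" factors come from. A tiny standalone check of that expansion with small stand-in values, not the generated code:

/* (a0 + a1*X + a2*X^2)^2 needs 6 products instead of 9 when the
 * off-diagonal terms are doubled. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const int64_t X = 1000;
    const int64_t a0 = 12, a1 = 34, a2 = 56;
    const int64_t a  = a0 + a1*X + a2*X*X;

    const int64_t sq = a0*a0
        + (2*a0*a1)         * X
        + (2*a0*a2 + a1*a1) * X*X
        + (2*a1*a2)         * X*X*X
        + a2*a2             * X*X*X*X;

    printf("%d\n", sq == a*a);   /* prints 1 */
    return 0;
}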
*/ -SP_NOINLINE static void sp_2048_sqr_45(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) { - sp_digit p0[30]; - sp_digit p1[30]; - sp_digit p2[30]; - sp_digit p3[30]; - sp_digit p4[30]; - sp_digit p5[30]; - sp_digit t0[30]; - sp_digit t1[30]; - sp_digit t2[30]; - sp_digit a0[15]; - sp_digit a1[15]; - sp_digit a2[15]; - (void)sp_2048_add_15(a0, a, &a[15]); - (void)sp_2048_add_15(a1, &a[15], &a[30]); - (void)sp_2048_add_15(a2, a0, &a[30]); - sp_2048_sqr_15(p0, a); - sp_2048_sqr_15(p2, &a[15]); - sp_2048_sqr_15(p4, &a[30]); - sp_2048_sqr_15(p1, a0); - sp_2048_sqr_15(p3, a1); - sp_2048_sqr_15(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*45U); - (void)sp_2048_sub_30(t0, p3, p2); - (void)sp_2048_sub_30(t1, p1, p2); - (void)sp_2048_sub_30(t2, p5, t0); - (void)sp_2048_sub_30(t2, t2, t1); - (void)sp_2048_sub_30(t0, t0, p4); - (void)sp_2048_sub_30(t1, t1, p0); - (void)sp_2048_add_30(r, r, p0); - (void)sp_2048_add_30(&r[15], &r[15], t1); - (void)sp_2048_add_30(&r[30], &r[30], t2); - (void)sp_2048_add_30(&r[45], &r[45], t0); - (void)sp_2048_add_30(&r[60], &r[60], p4); + sp_digit p0[24]; + sp_digit p1[24]; + sp_digit p2[24]; + sp_digit p3[24]; + sp_digit p4[24]; + sp_digit p5[24]; + sp_digit t0[24]; + sp_digit t1[24]; + sp_digit t2[24]; + sp_digit a0[12]; + sp_digit a1[12]; + sp_digit a2[12]; + (void)sp_2048_add_12(a0, a, &a[12]); + sp_2048_norm_12(a0); + (void)sp_2048_add_12(a1, &a[12], &a[24]); + sp_2048_norm_12(a1); + (void)sp_2048_add_12(a2, a0, &a[24]); + sp_2048_norm_12(a2); + sp_2048_sqr_12(p0, a); + sp_2048_sqr_12(p2, &a[12]); + sp_2048_sqr_12(p4, &a[24]); + sp_2048_sqr_12(p1, a0); + sp_2048_sqr_12(p3, a1); + sp_2048_sqr_12(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*36U); + (void)sp_2048_sub_24(t0, p3, p2); + (void)sp_2048_sub_24(t1, p1, p2); + (void)sp_2048_sub_24(t2, p5, t0); + (void)sp_2048_sub_24(t2, t2, t1); + sp_2048_norm_24(t2); + (void)sp_2048_sub_24(t0, t0, p4); + sp_2048_norm_24(t0); + (void)sp_2048_sub_24(t1, t1, p0); + sp_2048_norm_24(t1); + (void)sp_2048_add_24(r, r, p0); + (void)sp_2048_add_24(&r[12], &r[12], t1); + (void)sp_2048_add_24(&r[24], &r[24], t2); + (void)sp_2048_add_24(&r[36], &r[36], t0); + (void)sp_2048_add_24(&r[48], &r[48], p4); + sp_2048_norm_72(r); } /* Add b to a into r. (r = a + b) @@ -846,12 +856,12 @@ SP_NOINLINE static void sp_2048_sqr_45(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_add_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 40; i += 8) { + for (i = 0; i < 32; i += 8) { r[i + 0] = a[i + 0] + b[i + 0]; r[i + 1] = a[i + 1] + b[i + 1]; r[i + 2] = a[i + 2] + b[i + 2]; @@ -861,11 +871,10 @@ SP_NOINLINE static int sp_2048_add_45(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + b[i + 6]; r[i + 7] = a[i + 7] + b[i + 7]; } - r[40] = a[40] + b[40]; - r[41] = a[41] + b[41]; - r[42] = a[42] + b[42]; - r[43] = a[43] + b[43]; - r[44] = a[44] + b[44]; + r[32] = a[32] + b[32]; + r[33] = a[33] + b[33]; + r[34] = a[34] + b[34]; + r[35] = a[35] + b[35]; return 0; } @@ -876,12 +885,12 @@ SP_NOINLINE static int sp_2048_add_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
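This patch moves the 2048-bit C code from 90 words of 23 bits to 72 words of 29 bits. 71 limbs would be enough to hold 2048 bits; the extra word presumably keeps the splits even for the 2 x 36 and 3 x 12 layers used above and in the 72-word routines that follow (the old layout needed no padding, since 90 = 2 x 45 = 6 x 15 exactly). Illustrative arithmetic only:

#include <stdio.h>

int main(void)
{
    const int bits = 2048;

    printf("23-bit digits: %d words (2 x 45 = 6 x 15)\n", (bits + 22) / 23);
    printf("29-bit digits: %d words, padded to 72 (2 x 36 = 6 x 12)\n",
           (bits + 28) / 29);
    return 0;
}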
*/ -SP_NOINLINE static int sp_2048_add_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 88; i += 8) { + for (i = 0; i < 72; i += 8) { r[i + 0] = a[i + 0] + b[i + 0]; r[i + 1] = a[i + 1] + b[i + 1]; r[i + 2] = a[i + 2] + b[i + 2]; @@ -891,8 +900,6 @@ SP_NOINLINE static int sp_2048_add_90(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + b[i + 6]; r[i + 7] = a[i + 7] + b[i + 7]; } - r[88] = a[88] + b[88]; - r[89] = a[89] + b[89]; return 0; } @@ -903,12 +910,12 @@ SP_NOINLINE static int sp_2048_add_90(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_sub_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_sub_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 88; i += 8) { + for (i = 0; i < 72; i += 8) { r[i + 0] = a[i + 0] - b[i + 0]; r[i + 1] = a[i + 1] - b[i + 1]; r[i + 2] = a[i + 2] - b[i + 2]; @@ -918,34 +925,69 @@ SP_NOINLINE static int sp_2048_sub_90(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] - b[i + 6]; r[i + 7] = a[i + 7] - b[i + 7]; } - r[88] = a[88] - b[88]; - r[89] = a[89] - b[89]; return 0; } +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_144(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 143; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + int i; + for (i = 0; i < 136; i += 8) { + a[i+1] += a[i+0] >> 29; a[i+0] &= 0x1fffffff; + a[i+2] += a[i+1] >> 29; a[i+1] &= 0x1fffffff; + a[i+3] += a[i+2] >> 29; a[i+2] &= 0x1fffffff; + a[i+4] += a[i+3] >> 29; a[i+3] &= 0x1fffffff; + a[i+5] += a[i+4] >> 29; a[i+4] &= 0x1fffffff; + a[i+6] += a[i+5] >> 29; a[i+5] &= 0x1fffffff; + a[i+7] += a[i+6] >> 29; a[i+6] &= 0x1fffffff; + a[i+8] += a[i+7] >> 29; a[i+7] &= 0x1fffffff; + } + a[137] += a[136] >> 29; a[136] &= 0x1fffffff; + a[138] += a[137] >> 29; a[137] &= 0x1fffffff; + a[139] += a[138] >> 29; a[138] &= 0x1fffffff; + a[140] += a[139] >> 29; a[139] &= 0x1fffffff; + a[141] += a[140] >> 29; a[140] &= 0x1fffffff; + a[142] += a[141] >> 29; a[141] &= 0x1fffffff; + a[143] += a[142] >> 29; a[142] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_2048_mul_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { sp_digit* z0 = r; - sp_digit z1[90]; + sp_digit z1[72]; sp_digit* a1 = z1; - sp_digit b1[45]; - sp_digit* z2 = r + 90; - (void)sp_2048_add_45(a1, a, &a[45]); - (void)sp_2048_add_45(b1, b, &b[45]); - sp_2048_mul_45(z2, &a[45], &b[45]); - sp_2048_mul_45(z0, a, b); - sp_2048_mul_45(z1, a1, b1); - (void)sp_2048_sub_90(z1, z1, z2); - (void)sp_2048_sub_90(z1, z1, z0); - (void)sp_2048_add_90(r + 45, r + 45, z1); + sp_digit b1[36]; + sp_digit* z2 = r + 72; + (void)sp_2048_add_36(a1, a, &a[36]); + sp_2048_norm_36(a1); + (void)sp_2048_add_36(b1, b, &b[36]); + sp_2048_norm_36(b1); + sp_2048_mul_36(z2, &a[36], &b[36]); + sp_2048_mul_36(z0, a, b); + sp_2048_mul_36(z1, a1, b1); + (void)sp_2048_sub_72(z1, z1, z2); + (void)sp_2048_sub_72(z1, z1, z0); + (void)sp_2048_add_72(r + 36, r + 36, z1); + sp_2048_norm_144(r); } /* Square a and put result in r. 
(r = a * a) @@ -953,19 +995,21 @@ SP_NOINLINE static void sp_2048_mul_90(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_90(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_72(sp_digit* r, const sp_digit* a) { sp_digit* z0 = r; - sp_digit z1[90]; + sp_digit z1[72]; sp_digit* a1 = z1; - sp_digit* z2 = r + 90; - (void)sp_2048_add_45(a1, a, &a[45]); - sp_2048_sqr_45(z2, &a[45]); - sp_2048_sqr_45(z0, a); - sp_2048_sqr_45(z1, a1); - (void)sp_2048_sub_90(z1, z1, z2); - (void)sp_2048_sub_90(z1, z1, z0); - (void)sp_2048_add_90(r + 45, r + 45, z1); + sp_digit* z2 = r + 72; + (void)sp_2048_add_36(a1, a, &a[36]); + sp_2048_norm_36(a1); + sp_2048_sqr_36(z2, &a[36]); + sp_2048_sqr_36(z0, a); + sp_2048_sqr_36(z1, a1); + (void)sp_2048_sub_72(z1, z1, z2); + (void)sp_2048_sub_72(z1, z1, z0); + (void)sp_2048_add_72(r + 36, r + 36, z1); + sp_2048_norm_144(r); } #endif /* !WOLFSSL_SP_SMALL */ @@ -976,12 +1020,12 @@ SP_NOINLINE static void sp_2048_sqr_90(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_add_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { r[i] = a[i] + b[i]; } @@ -995,12 +1039,12 @@ SP_NOINLINE static int sp_2048_add_90(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_sub_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_sub_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { r[i] = a[i] - b[i]; } @@ -1015,34 +1059,53 @@ SP_NOINLINE static int sp_2048_sub_90(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_2048_mul_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_72(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 lo; - c = ((int64_t)a[89]) * b[89]; - r[179] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 177; k >= 0; k--) { - for (i = 89; i >= 0; i--) { - j = k - i; - if (j >= 90) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + c = ((sp_uint64)a[71]) * b[71]; + r[143] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 141; k >= 0; k--) { + if (k >= 72) { + i = k - 71; + imax = 71; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; } - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; } - r[0] = (sp_digit)(c >> 23); + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -1050,37 +1113,63 @@ SP_NOINLINE static void sp_2048_mul_90(sp_digit* r, const sp_digit* a, * r A single precision integer. 
* a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_90(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_72(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[89]) * a[89]; - r[179] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 177; k >= 0; k--) { - for (i = 89; i >= 0; i--) { - j = k - i; - if (j >= 90 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + c = ((sp_uint64)a[71]) * a[71]; + r[143] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 141; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + if (k < 71) { + imax = k; } + else { + imax = 71; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 29; + c &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } } - r[0] = (sp_digit)(c >> 23); + r[0] = (sp_digit)(c >> 29); } #endif /* WOLFSSL_SP_SMALL */ @@ -1092,12 +1181,12 @@ SP_NOINLINE static void sp_2048_sqr_90(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_add_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 45; i++) { + for (i = 0; i < 36; i++) { r[i] = a[i] + b[i]; } @@ -1111,12 +1200,12 @@ SP_NOINLINE static int sp_2048_add_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_sub_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 45; i++) { + for (i = 0; i < 36; i++) { r[i] = a[i] - b[i]; } @@ -1130,12 +1219,12 @@ SP_NOINLINE static int sp_2048_sub_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_2048_sub_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 40; i += 8) { + for (i = 0; i < 32; i += 8) { r[i + 0] = a[i + 0] - b[i + 0]; r[i + 1] = a[i + 1] - b[i + 1]; r[i + 2] = a[i + 2] - b[i + 2]; @@ -1145,11 +1234,10 @@ SP_NOINLINE static int sp_2048_sub_45(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] - b[i + 6]; r[i + 7] = a[i + 7] - b[i + 7]; } - r[40] = a[40] - b[40]; - r[41] = a[41] - b[41]; - r[42] = a[42] - b[42]; - r[43] = a[43] - b[43]; - r[44] = a[44] - b[44]; + r[32] = a[32] - b[32]; + r[33] = a[33] - b[33]; + r[34] = a[34] - b[34]; + r[35] = a[35] - b[35]; return 0; } @@ -1162,34 +1250,53 @@ SP_NOINLINE static int sp_2048_sub_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
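The small-code multiply below walks the product column by column. Each 29x29-bit partial product is just under 2^58, leaving roughly six bits of headroom in a 64-bit accumulator, so the rewritten loops flush the low accumulator after a bounded run of products (15 here, 14 in the squaring where cross terms are doubled) rather than re-splitting the carry on every step. A standalone sketch of that split-accumulator pattern; mul_comb, CHUNK and the 4-limb size are illustrative, with CHUNK kept deliberately tiny so the flush path actually runs:

#include <stdio.h>
#include <stdint.h>

#define LIMB_BITS 29
#define LIMB_MASK 0x1fffffffU
#define N 4
#define CHUNK 2               /* the generated code can afford 15 */

static void mul_comb(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
    uint64_t c = 0;                          /* carry into current column */
    int k;

    for (k = 0; k < 2*N - 1; k++) {
        uint64_t lo = 0;                     /* weight 2^(29k) */
        uint64_t hi = 0;                     /* flushed part, weight 2^(29(k+1)) */
        int cnt = 0;
        int i;

        for (i = (k < N) ? 0 : (k - N + 1); i <= k && i < N; i++) {
            lo += (uint64_t)a[i] * b[k - i];
            if (++cnt == CHUNK) {            /* flush before lo can overflow */
                hi += lo >> LIMB_BITS;
                lo &= LIMB_MASK;
                cnt = 0;
            }
        }
        c += lo;
        r[k] = (uint32_t)(c & LIMB_MASK);
        c = (c >> LIMB_BITS) + hi;           /* carry out of this column */
    }
    r[2*N - 1] = (uint32_t)c;
}

int main(void)
{
    const uint32_t a[N] = { 0x1fffffff, 0x12345678, 0x0abcdef0, 0x1fffffff };
    const uint32_t b[N] = { 0x1fffffff, 0x1fffffff, 0x00000001, 0x1c0ffee0 };
    uint32_t r[2*N];
    uint32_t ref[2*N];
    uint64_t t = 0;
    int i, k, ok = 1;

    mul_comb(r, a, b);

    /* plain reference: for N = 4 the column sums fit in 64 bits anyway */
    for (k = 0; k < 2*N - 1; k++) {
        for (i = 0; i < N; i++)
            if (k - i >= 0 && k - i < N)
                t += (uint64_t)a[i] * b[k - i];
        ref[k] = (uint32_t)(t & LIMB_MASK);
        t >>= LIMB_BITS;
    }
    ref[2*N - 1] = (uint32_t)t;

    for (i = 0; i < 2*N; i++)
        ok &= (r[i] == ref[i]);
    printf("%s\n", ok ? "match" : "mismatch");   /* prints "match" */
    return 0;
}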
*/ -SP_NOINLINE static void sp_2048_mul_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 lo; - c = ((int64_t)a[44]) * b[44]; - r[89] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 87; k >= 0; k--) { - for (i = 44; i >= 0; i--) { - j = k - i; - if (j >= 45) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + c = ((sp_uint64)a[35]) * b[35]; + r[71] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 69; k >= 0; k--) { + if (k >= 36) { + i = k - 35; + imax = 35; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; } - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; } - r[0] = (sp_digit)(c >> 23); + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -1197,37 +1304,63 @@ SP_NOINLINE static void sp_2048_mul_45(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_45(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[44]) * a[44]; - r[89] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 87; k >= 0; k--) { - for (i = 44; i >= 0; i--) { - j = k - i; - if (j >= 45 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + c = ((sp_uint64)a[35]) * a[35]; + r[71] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 69; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + if (k < 35) { + imax = k; } + else { + imax = 35; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 29; + c &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } } - r[0] = (sp_digit)(c >> 23); + r[0] = (sp_digit)(c >> 29); } #endif /* WOLFSSL_SP_SMALL */ @@ -1248,10 +1381,10 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) x *= 2 - b * x; /* here x*a==1 mod 2**8 */ x *= 2 - b * x; /* here x*a==1 mod 2**16 */ x *= 2 - b * x; /* here x*a==1 mod 2**32 */ - x &= 0x7fffff; + x &= 0x1fffffff; /* rho = -1/m mod b */ - *rho = ((sp_digit)1 << 23) - x; + *rho = 
((sp_digit)1 << 29) - x; } /* Multiply a by scalar b into r. (r = a * b) @@ -1260,56 +1393,50 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) * a A single precision integer. * b A scalar. */ -SP_NOINLINE static void sp_2048_mul_d_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_d_72(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[90] = (sp_digit)t; + r[72] = (sp_digit)t; #else - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; sp_digit t2; - int64_t p[4]; + sp_int64 p[4]; int i; - for (i = 0; i < 88; i += 4) { + for (i = 0; i < 72; i += 4) { p[0] = tb * a[i + 0]; p[1] = tb * a[i + 1]; p[2] = tb * a[i + 2]; p[3] = tb * a[i + 3]; t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 0] = (sp_digit)t2; t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 1] = (sp_digit)t2; t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 2] = (sp_digit)t2; t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 3] = (sp_digit)t2; } - t += tb * a[88]; - r[88] = (sp_digit)(t & 0x7fffff); - t >>= 23; - t += tb * a[89]; - r[89] = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[90] = (sp_digit)(t & 0x7fffff); + r[72] = (sp_digit)(t & 0x1fffffff); #endif /* WOLFSSL_SP_SMALL */ } @@ -1320,37 +1447,36 @@ SP_NOINLINE static void sp_2048_mul_d_90(sp_digit* r, const sp_digit* a, * r A single precision number. * m A single precision number. */ -static void sp_2048_mont_norm_45(sp_digit* r, const sp_digit* m) +static void sp_2048_mont_norm_36(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ #ifdef WOLFSSL_SP_SMALL int i; - for (i=0; i<44; i++) { - r[i] = 0x7fffff; + for (i=0; i<35; i++) { + r[i] = 0x1fffffff; } #else int i; - for (i = 0; i < 40; i += 8) { - r[i + 0] = 0x7fffff; - r[i + 1] = 0x7fffff; - r[i + 2] = 0x7fffff; - r[i + 3] = 0x7fffff; - r[i + 4] = 0x7fffff; - r[i + 5] = 0x7fffff; - r[i + 6] = 0x7fffff; - r[i + 7] = 0x7fffff; + for (i = 0; i < 32; i += 8) { + r[i + 0] = 0x1fffffff; + r[i + 1] = 0x1fffffff; + r[i + 2] = 0x1fffffff; + r[i + 3] = 0x1fffffff; + r[i + 4] = 0x1fffffff; + r[i + 5] = 0x1fffffff; + r[i + 6] = 0x1fffffff; + r[i + 7] = 0x1fffffff; } - r[40] = 0x7fffff; - r[41] = 0x7fffff; - r[42] = 0x7fffff; - r[43] = 0x7fffff; -#endif - r[44] = 0xfffL; + r[32] = 0x1fffffff; + r[33] = 0x1fffffff; + r[34] = 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ + r[35] = 0x1ffL; /* r = (2^n - 1) mod n */ - (void)sp_2048_sub_45(r, r, m); + (void)sp_2048_sub_36(r, r, m); /* Add one so r = 2^n mod m */ r[0] += 1; @@ -1363,24 +1489,23 @@ static void sp_2048_mont_norm_45(sp_digit* r, const sp_digit* m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static sp_digit sp_2048_cmp_45(const sp_digit* a, const sp_digit* b) +static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; #ifdef WOLFSSL_SP_SMALL int i; - for (i=44; i>=0; i--) { + for (i=35; i>=0; i--) { r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } #else int i; - r |= (a[44] - b[44]) & (0 - (sp_digit)((r == 0) ? 
1 : 0)); - r |= (a[43] - b[43]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[42] - b[42]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[41] - b[41]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[40] - b[40]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 32; i >= 0; i -= 8) { + r |= (a[35] - b[35]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[34] - b[34]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[33] - b[33]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[32] - b[32]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + for (i = 24; i >= 0; i -= 8) { r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -1403,19 +1528,19 @@ static sp_digit sp_2048_cmp_45(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -static void sp_2048_cond_sub_45(sp_digit* r, const sp_digit* a, +static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 45; i++) { + for (i = 0; i < 36; i++) { r[i] = a[i] - (b[i] & m); } #else int i; - for (i = 0; i < 40; i += 8) { + for (i = 0; i < 32; i += 8) { r[i + 0] = a[i + 0] - (b[i + 0] & m); r[i + 1] = a[i + 1] - (b[i + 1] & m); r[i + 2] = a[i + 2] - (b[i + 2] & m); @@ -1425,11 +1550,10 @@ static void sp_2048_cond_sub_45(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] - (b[i + 6] & m); r[i + 7] = a[i + 7] - (b[i + 7] & m); } - r[40] = a[40] - (b[40] & m); - r[41] = a[41] - (b[41] & m); - r[42] = a[42] - (b[42] & m); - r[43] = a[43] - (b[43] & m); - r[44] = a[44] - (b[44] & m); + r[32] = a[32] - (b[32] & m); + r[33] = a[33] - (b[33] & m); + r[34] = a[34] - (b[34] & m); + r[35] = a[35] - (b[35] & m); #endif /* WOLFSSL_SP_SMALL */ } @@ -1439,85 +1563,100 @@ static void sp_2048_cond_sub_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
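sp_2048_mul_add_36 below is the word-by-word multiply-accumulate at the heart of Montgomery reduction, and it now comes in a compact form plus unrolled WOLFSSL_SP_LARGE_CODE variants that batch four or eight limbs per pass. A minimal standalone sketch of the compact shape; mul_add here is a hypothetical helper, not the wolfSSL function:

/* r += a * b for one scalar b, limbs in base 2^29; the sum of an existing
 * limb, one 29x29-bit product and the carry stays well inside 64 bits. */
#include <stdio.h>
#include <stdint.h>

#define LIMB_BITS 29
#define LIMB_MASK 0x1fffffff

static void mul_add(int32_t* r, const int32_t* a, int32_t b, int n)
{
    int64_t tb = b;
    int64_t t = 0;
    int i;

    for (i = 0; i < n; i++) {
        t += r[i];                    /* existing limb */
        t += tb * a[i];               /* plus the new partial product */
        r[i] = (int32_t)(t & LIMB_MASK);
        t >>= LIMB_BITS;              /* carry to the next limb */
    }
    r[n] += (int32_t)t;               /* final carry lands one limb above */
}

int main(void)
{
    /* r = 1, a = 3 + 5*2^29, b = 7: expect r = 22 + 35*2^29 */
    int32_t r[3] = { 1, 0, 0 };
    int32_t a[2] = { 3, 5 };

    mul_add(r, a, 7, 2);
    printf("%d %d %d\n", r[0], r[1], r[2]);   /* prints: 22 35 0 */
    return 0;
}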
*/ -SP_NOINLINE static void sp_2048_mul_add_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 45; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x7fffff; - t >>= 23; + for (i = 0; i < 36; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; } - r[45] += (sp_digit)t; + r[36] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[8]; +#ifdef WOLFSSL_SP_SMALL + sp_int64 tb = b; + sp_int64 t[4]; int i; - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x7fffff); - for (i = 0; i < 40; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 23) + (t[5] & 0x7fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 23) + (t[6] & 0x7fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 23) + (t[7] & 0x7fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 23) + (t[0] & 0x7fffff)); + t[0] = 0; + for (i = 0; i < 32; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; } - t[1] = tb * a[41]; - r[41] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[42]; - r[42] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[43]; - r[43] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[44]; - r[44] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - r[45] += (sp_digit)(t[4] >> 23); + t[0] += (tb * a[32]) + r[32]; + t[1] = (tb * a[33]) + r[33]; + t[2] = (tb * a[34]) + r[34]; + t[3] = (tb * a[35]) + r[35]; + r[32] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[33] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[34] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[35] = t[3] & 0x1fffffff; + r[36] += (sp_digit)(t[3] >> 29); +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 32; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[4] += t[3] >> 29; + r[i+4] = t[4] & 0x1fffffff; + t[5] += t[4] >> 29; + r[i+5] = t[5] & 0x1fffffff; + t[6] += t[5] >> 29; + r[i+6] = t[6] & 0x1fffffff; + t[7] += t[6] >> 29; + r[i+7] = t[7] & 0x1fffffff; + t[0] = t[7] >> 29; + } + t[0] += (tb * a[32]) + r[32]; + t[1] = (tb * a[33]) + r[33]; + t[2] = (tb * a[34]) + r[34]; + t[3] = (tb * a[35]) + r[35]; + r[32] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[33] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[34] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[35] = t[3] & 
0x1fffffff; + r[36] += (sp_digit)(t[3] >> 29); #endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 23. - * - * a Array of sp_digit to normalize. - */ -static void sp_2048_norm_45(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 44; i++) { - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - } -#else - int i; - for (i = 0; i < 40; i += 8) { - a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff; - a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff; - a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff; - a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff; - a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff; - a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff; - a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff; - a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff; - } - a[40+1] += a[40] >> 23; a[40] &= 0x7fffff; - a[41+1] += a[41] >> 23; a[41] &= 0x7fffff; - a[42+1] += a[42] >> 23; a[42] &= 0x7fffff; - a[43+1] += a[43] >> 23; a[43] &= 0x7fffff; -#endif +#endif /* !WOLFSSL_SP_LARGE_CODE */ } /* Shift the result in the high 1024 bits down to the bottom. @@ -1525,48 +1664,47 @@ static void sp_2048_norm_45(sp_digit* a) * r A single precision number. * a A single precision number. */ -static void sp_2048_mont_shift_45(sp_digit* r, const sp_digit* a) +static void sp_2048_mont_shift_36(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - int64_t n = a[44] >> 12; - n += ((int64_t)a[45]) << 11; + sp_int64 n = a[35] >> 9; + n += ((sp_int64)a[36]) << 20; - for (i = 0; i < 44; i++) { - r[i] = n & 0x7fffff; - n >>= 23; - n += ((int64_t)a[46 + i]) << 11; + for (i = 0; i < 35; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[37 + i]) << 20; } - r[44] = (sp_digit)n; + r[35] = (sp_digit)n; #else int i; - int64_t n = a[44] >> 12; - n += ((int64_t)a[45]) << 11; - for (i = 0; i < 40; i += 8) { - r[i + 0] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 46]) << 11; - r[i + 1] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 47]) << 11; - r[i + 2] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 48]) << 11; - r[i + 3] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 49]) << 11; - r[i + 4] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 50]) << 11; - r[i + 5] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 51]) << 11; - r[i + 6] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 52]) << 11; - r[i + 7] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 53]) << 11; + sp_int64 n = a[35] >> 9; + n += ((sp_int64)a[36]) << 20; + for (i = 0; i < 32; i += 8) { + r[i + 0] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 37]) << 20; + r[i + 1] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 38]) << 20; + r[i + 2] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 39]) << 20; + r[i + 3] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 40]) << 20; + r[i + 4] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 41]) << 20; + r[i + 5] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 42]) << 20; + r[i + 6] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 43]) << 20; + r[i + 7] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 44]) << 20; } - r[40] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[86]) << 11; - r[41] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[87]) << 11; - r[42] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[88]) << 11; - r[43] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[89]) << 11; - r[44] = (sp_digit)n; + r[32] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[69]) << 20; + r[33] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[70]) << 20; + r[34] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[71]) << 20; + r[35] 
= (sp_digit)n; #endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[45], 0, sizeof(*r) * 45U); + XMEMSET(&r[36], 0, sizeof(*r) * 36U); } /* Reduce the number back to 2048 bits using Montgomery reduction. @@ -1575,26 +1713,26 @@ static void sp_2048_mont_shift_45(sp_digit* r, const sp_digit* a) * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -static void sp_2048_mont_reduce_45(sp_digit* a, const sp_digit* m, sp_digit mp) +static void sp_2048_mont_reduce_36(sp_digit* a, const sp_digit* m, sp_digit mp) { int i; sp_digit mu; - sp_2048_norm_45(a + 45); + sp_2048_norm_36(a + 36); - for (i=0; i<44; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_2048_mul_add_45(a+i, m, mu); - a[i+1] += a[i] >> 23; + for (i=0; i<35; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_2048_mul_add_36(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0xfffL; - sp_2048_mul_add_45(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - sp_2048_mont_shift_45(a, a); - sp_2048_cond_sub_45(a, a, m, 0 - (((a[44] >> 12) > 0) ? + mu = (a[i] * mp) & 0x1ffL; + sp_2048_mul_add_36(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + sp_2048_mont_shift_36(a, a); + sp_2048_cond_sub_36(a, a, m, 0 - (((a[35] - m[35]) > 0) ? (sp_digit)1 : (sp_digit)0)); - sp_2048_norm_45(a); + sp_2048_norm_36(a); } /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -1606,11 +1744,11 @@ static void sp_2048_mont_reduce_45(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_2048_mont_mul_45(sp_digit* r, const sp_digit* a, +static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_2048_mul_45(r, a, b); - sp_2048_mont_reduce_45(r, m, mp); + sp_2048_mul_36(r, a, b); + sp_2048_mont_reduce_36(r, m, mp); } /* Square the Montgomery form number. (r = a * a mod m) @@ -1620,11 +1758,11 @@ static void sp_2048_mont_mul_45(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_2048_mont_sqr_45(sp_digit* r, const sp_digit* a, +static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_2048_sqr_45(r, a); - sp_2048_mont_reduce_45(r, m, mp); + sp_2048_sqr_36(r, a); + sp_2048_mont_reduce_36(r, m, mp); } /* Multiply a by scalar b into r. (r = a * b) @@ -1633,53 +1771,50 @@ static void sp_2048_mont_sqr_45(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
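The sp_2048_mont_reduce_36 / sp_2048_mont_shift_36 pair above is word-wise Montgomery reduction in radix 2^29: each low word is cleared by adding a multiple of the modulus, the high half is shifted down, and one conditional subtract brings the result under m. The toy program below shows the same three steps in a radix of 2^8 with a 16-bit modulus so everything fits in uint32_t; the names redc, B, N and the test values are assumptions made for this sketch, not code from the patch.

#include <stdint.h>
#include <stdio.h>

#define B    8                  /* bits per word (29 in the patch)     */
#define N    2                  /* words in the modulus (36/72 above)  */
#define MASK ((1u << B) - 1)

/* Reduce a (< m * 2^(B*N)) to a value congruent to a * 2^-(B*N) mod m. */
static uint32_t redc(uint32_t a, uint32_t m, uint32_t mp)
{
    uint32_t w[2 * N + 1];
    uint32_t res;
    int i, j;

    for (i = 0; i < 2 * N; i++) {
        w[i] = (a >> (B * i)) & MASK;    /* split a into B-bit words */
    }
    w[2 * N] = 0;                        /* room for the running carry */

    for (i = 0; i < N; i++) {
        /* mu is chosen so that w[i] + mu * m is 0 mod 2^B. */
        uint32_t mu = (w[i] * mp) & MASK;
        uint32_t c = 0;
        for (j = 0; j < N; j++) {
            uint32_t t = w[i + j] + mu * ((m >> (B * j)) & MASK) + c;
            w[i + j] = t & MASK;
            c = t >> B;
        }
        for (j = i + N; c != 0; j++) {   /* propagate the final carry */
            uint32_t t = w[j] + c;
            w[j] = t & MASK;
            c = t >> B;
        }
    }
    /* The low N words are now zero: shift them away and reduce once. */
    res = w[N] | (w[N + 1] << B) | (w[N + 2] << (2 * B));
    return (res >= m) ? (res - m) : res;
}

int main(void)
{
    uint32_t m = 0xBEEF;                 /* odd modulus filling N words */
    uint32_t mp = 0;
    uint32_t a = 0x12345678;             /* any value below m * 2^(B*N) */
    uint32_t r;

    /* mp = -m^-1 mod 2^B, found by brute force just for this demo. */
    while (((m * mp + 1) & MASK) != 0) {
        mp++;
    }
    r = redc(a, m, mp);
    /* redc(a) * 2^(B*N) must equal a modulo m; both prints should match. */
    printf("%u %u\n", (unsigned)(((uint64_t)r << (B * N)) % m), (unsigned)(a % m));
    return 0;
}

In the patch the final reduction is done with the masked sp_2048_cond_sub_36 rather than a ternary, so the step stays constant time.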
*/ -SP_NOINLINE static void sp_2048_mul_d_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 45; i++) { + for (i = 0; i < 36; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[45] = (sp_digit)t; + r[36] = (sp_digit)t; #else - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; sp_digit t2; - int64_t p[4]; + sp_int64 p[4]; int i; - for (i = 0; i < 44; i += 4) { + for (i = 0; i < 36; i += 4) { p[0] = tb * a[i + 0]; p[1] = tb * a[i + 1]; p[2] = tb * a[i + 2]; p[3] = tb * a[i + 3]; t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 0] = (sp_digit)t2; t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 1] = (sp_digit)t2; t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 2] = (sp_digit)t2; t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 3] = (sp_digit)t2; } - t += tb * a[44]; - r[44] = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[45] = (sp_digit)(t & 0x7fffff); + r[36] = (sp_digit)(t & 0x1fffffff); #endif /* WOLFSSL_SP_SMALL */ } @@ -1691,19 +1826,19 @@ SP_NOINLINE static void sp_2048_mul_d_45(sp_digit* r, const sp_digit* a, * b A single precision number to add. * m Mask value to apply. */ -static void sp_2048_cond_add_45(sp_digit* r, const sp_digit* a, +static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 45; i++) { + for (i = 0; i < 36; i++) { r[i] = a[i] + (b[i] & m); } #else int i; - for (i = 0; i < 40; i += 8) { + for (i = 0; i < 32; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -1713,71 +1848,165 @@ static void sp_2048_cond_add_45(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[40] = a[40] + (b[40] & m); - r[41] = a[41] + (b[41] & m); - r[42] = a[42] + (b[42] & m); - r[43] = a[43] + (b[43] & m); - r[44] = a[44] + (b[44] & m); + r[32] = a[32] + (b[32] & m); + r[33] = a[33] + (b[33] & m); + r[34] = a[34] + (b[34] & m); + r[35] = a[35] + (b[35] & m); #endif /* WOLFSSL_SP_SMALL */ } -SP_NOINLINE static void sp_2048_rshift_45(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_rshift_36(sp_digit* r, const sp_digit* a, byte n) { int i; #ifdef WOLFSSL_SP_SMALL - for (i=0; i<44; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (23 - n))) & 0x7fffff; + for (i=0; i<35; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } #else - for (i=0; i<40; i += 8) { - r[i+0] = (a[i+0] >> n) | ((a[i+1] << (23 - n)) & 0x7fffff); - r[i+1] = (a[i+1] >> n) | ((a[i+2] << (23 - n)) & 0x7fffff); - r[i+2] = (a[i+2] >> n) | ((a[i+3] << (23 - n)) & 0x7fffff); - r[i+3] = (a[i+3] >> n) | ((a[i+4] << (23 - n)) & 0x7fffff); - r[i+4] = (a[i+4] >> n) | ((a[i+5] << (23 - n)) & 0x7fffff); - r[i+5] = (a[i+5] >> n) | ((a[i+6] << (23 - n)) & 0x7fffff); - r[i+6] = (a[i+6] >> n) | ((a[i+7] << (23 - n)) & 0x7fffff); - r[i+7] = (a[i+7] >> n) | ((a[i+8] << (23 - n)) & 0x7fffff); + for (i=0; i<32; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (29 - n)) & 
0x1fffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (29 - n)) & 0x1fffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (29 - n)) & 0x1fffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (29 - n)) & 0x1fffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (29 - n)) & 0x1fffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (29 - n)) & 0x1fffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (29 - n)) & 0x1fffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (29 - n)) & 0x1fffffff); } - r[40] = (a[40] >> n) | ((a[41] << (23 - n)) & 0x7fffff); - r[41] = (a[41] >> n) | ((a[42] << (23 - n)) & 0x7fffff); - r[42] = (a[42] >> n) | ((a[43] << (23 - n)) & 0x7fffff); - r[43] = (a[43] >> n) | ((a[44] << (23 - n)) & 0x7fffff); -#endif - r[44] = a[44] >> n; + r[32] = (a[32] >> n) | ((a[33] << (29 - n)) & 0x1fffffff); + r[33] = (a[33] >> n) | ((a[34] << (29 - n)) & 0x1fffffff); + r[34] = (a[34] >> n) | ((a[35] << (29 - n)) & 0x1fffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[35] = a[35] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_2048_div_word_45(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; sp_digit r; sp_digit t; - /* All 23 bits from d1 and top 8 bits from d0. */ - d = (d1 << 8) + (d0 >> 15); + /* All 29 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 27); r = d / dv; d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; /* Up to 9 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 7) & ((1 << 8) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); t = d / dv; d -= t * dv; r += t; /* Up to 17 bits in r */ - /* Remaining 7 bits from d0. */ - r <<= 7; - d <<= 7; - d += d0 & ((1 << 7) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); t = d / dv; r += t; - /* All 23 bits from d1 and top 8 bits from d0. */ + /* All 29 bits from d1 and top 2 bits from d0. */ return r; } #endif /* WOLFSSL_SP_DIV_32 */ @@ -1793,19 +2022,19 @@ static WC_INLINE sp_digit sp_2048_div_word_45(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_2048_div_45(const sp_digit* a, const sp_digit* d, +static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 45 + 3]; + sp_digit t1[4 * 36 + 3]; #endif sp_digit* t2 = NULL; sp_digit* sd = NULL; @@ -1814,7 +2043,7 @@ static int sp_2048_div_45(const sp_digit* a, const sp_digit* d, (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 45 + 3), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 36 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; @@ -1823,53 +2052,60 @@ static int sp_2048_div_45(const sp_digit* a, const sp_digit* d, (void)m; if (err == MP_OKAY) { - t2 = t1 + 90 + 1; - sd = t2 + 45 + 1; + t2 = t1 + 72 + 1; + sd = t2 + 36 + 1; - sp_2048_mul_d_45(sd, d, (sp_digit)1 << 11); - sp_2048_mul_d_90(t1, a, (sp_digit)1 << 11); - dv = sd[44]; - t1[45 + 45] += t1[45 + 45 - 1] >> 23; - t1[45 + 45 - 1] &= 0x7fffff; - for (i=45; i>=0; i--) { + sp_2048_mul_d_36(sd, d, (sp_digit)1 << 20); + sp_2048_mul_d_72(t1, a, (sp_digit)1 << 20); + dv = sd[35]; + t1[36 + 36] += t1[36 + 36 - 1] >> 29; + t1[36 + 36 - 1] &= 0x1fffffff; + for (i=36; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[45 + i]; - d1 <<= 23; - d1 += t1[45 + i - 1]; + d1 = t1[36 + i]; + d1 <<= 29; + d1 += t1[36 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_2048_div_word_45(t1[45 + i], t1[45 + i - 1], dv); + r1 = sp_2048_div_word_36(t1[36 + i], t1[36 + i - 1], dv); #endif - sp_2048_mul_d_45(t2, sd, r1); - (void)sp_2048_sub_45(&t1[i], &t1[i], t2); - sp_2048_norm_45(&t1[i]); - t1[45 + i] -= t2[45]; - t1[45 + i] += t1[45 + i - 1] >> 23; - t1[45 + i - 1] &= 0x7fffff; - r1 = (((-t1[45 + i]) << 23) - t1[45 + i - 1]) / dv; - r1 -= t1[45 + i]; - sp_2048_mul_d_45(t2, sd, r1); - (void)sp_2048_add_45(&t1[i], &t1[i], t2); - t1[45 + i] += t1[45 + i - 1] >> 23; - t1[45 + i - 1] &= 0x7fffff; + sp_2048_mul_d_36(t2, sd, r1); + (void)sp_2048_sub_36(&t1[i], &t1[i], t2); + sp_2048_norm_36(&t1[i]); + t1[36 + i] -= t2[36]; + t1[36 + i] += t1[36 + i - 1] >> 29; + t1[36 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[36 + i]; + d1 <<= 29; + d1 -= t1[36 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_36(-t1[36 + i], -t1[36 + i - 1], dv); +#endif + r1 -= t1[36 + i]; + sp_2048_mul_d_36(t2, sd, r1); + (void)sp_2048_add_36(&t1[i], &t1[i], t2); + t1[36 + i] += t1[36 + i - 1] >> 29; + t1[36 + i - 1] &= 0x1fffffff; } - t1[45 - 1] += t1[45 - 2] >> 23; - t1[45 - 2] &= 0x7fffff; - r1 = t1[45 - 1] / dv; + t1[36 - 1] += t1[36 - 2] >> 29; + t1[36 - 2] &= 0x1fffffff; + r1 = t1[36 - 1] / dv; - sp_2048_mul_d_45(t2, sd, r1); - sp_2048_sub_45(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 90U); - for (i=0; i<44; i++) { 
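For builds with WOLFSSL_SP_DIV_32 defined, sp_2048_div_word_36 above forms the quotient of a two-word value by one word without a 64-bit divide: it divides the leading bits, keeps the partial remainder, and folds in the next two bits of d0 at each step, with a final one-bit step. A cut-down sketch under toy sizes (15-bit words consumed three bits at a time); div_word, WBITS and STEP are names invented for the sketch:

#include <stdint.h>
#include <stdio.h>

#define WBITS 15    /* bits per word (29 in the patch)        */
#define STEP  3     /* bits folded in per division (2 there)  */

/* Return ((d1 << WBITS) + d0) / dv using only 32-bit divisions. */
static uint32_t div_word(uint32_t d1, uint32_t d0, uint32_t dv)
{
    /* All of d1 plus the top STEP bits of d0. */
    uint32_t d = (d1 << STEP) + (d0 >> (WBITS - STEP));
    uint32_t r = d / dv;
    int sh;

    d -= r * dv;
    for (sh = WBITS - 2 * STEP; sh >= 0; sh -= STEP) {
        uint32_t t;
        /* Append the next STEP bits of d0 to the remainder and divide again;
         * each partial quotient fits in STEP bits, so r never loses bits. */
        r <<= STEP;
        d = (d << STEP) + ((d0 >> sh) & ((1u << STEP) - 1));
        t = d / dv;
        d -= t * dv;
        r += t;
    }
    return r;
}

int main(void)
{
    uint32_t d1 = 0x4321, d0 = 0x1ABC, dv = 0x5A3C;  /* 15-bit words */
    uint64_t n = ((uint64_t)d1 << WBITS) + d0;

    /* Both prints should agree with the straightforward 64-bit division. */
    printf("%u %u\n", (unsigned)div_word(d1, d0, dv), (unsigned)(n / dv));
    return 0;
}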
- r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; + sp_2048_mul_d_36(t2, sd, r1); + sp_2048_sub_36(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 72U); + for (i=0; i<35; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } - sp_2048_cond_add_45(r, r, sd, 0 - ((r[44] < 0) ? + sp_2048_cond_add_36(r, r, sd, 0 - ((r[35] < 0) ? (sp_digit)1 : (sp_digit)0)); - sp_2048_norm_45(r); - sp_2048_rshift_45(r, r, 11); + sp_2048_norm_36(r); + sp_2048_rshift_36(r, r, 20); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -1887,9 +2123,9 @@ static int sp_2048_div_45(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_2048_mod_45(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_2048_mod_36(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_2048_div_45(a, m, NULL, r); + return sp_2048_div_36(a, m, NULL, r); } /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -1901,14 +2137,14 @@ static int sp_2048_mod_45(sp_digit* r, const sp_digit* a, const sp_digit* m) * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. */ -static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, +static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { #if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 90]; + sp_digit td[3 * 72]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -1920,7 +2156,7 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 45 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 36 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -1929,29 +2165,29 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 45 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 45U * 2U); + t[i] = td + (i * 36 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 36U * 2U); } sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_45(norm, m); + sp_2048_mont_norm_36(norm, m); if (reduceA != 0) { - err = sp_2048_mod_45(t[1], a, m); + err = sp_2048_mod_36(t[1], a, m); } else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 45U); + XMEMCPY(t[1], a, sizeof(sp_digit) * 36U); } } if (err == MP_OKAY) { - sp_2048_mul_45(t[1], t[1], norm); - err = sp_2048_mod_45(t[1], t[1], m); + sp_2048_mul_36(t[1], t[1], norm); + err = sp_2048_mod_36(t[1], t[1], m); } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -1959,28 +2195,28 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_2048_mont_mul_45(t[y^1], t[0], t[1], m, mp); + sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 45 * 2); - sp_2048_mont_sqr_45(t[2], t[2], m, mp); + 
sizeof(*t[2]) * 36 * 2); + sp_2048_mont_sqr_36(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 45 * 2); + sizeof(*t[2]) * 36 * 2); } - sp_2048_mont_reduce_45(t[0], m, mp); - n = sp_2048_cmp_45(t[0], m); - sp_2048_cond_sub_45(t[0], t[0], m, ((n < 0) ? + sp_2048_mont_reduce_36(t[0], m, mp); + n = sp_2048_cmp_36(t[0], m); + sp_2048_cond_sub_36(t[0], t[0], m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 45 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 36 * 2); } @@ -1994,7 +2230,7 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 90]; + sp_digit td[3 * 72]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -2006,7 +2242,7 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 45 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 36 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2015,29 +2251,29 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 45 * 2); + t[i] = td + (i * 36 * 2); } sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_45(norm, m); + sp_2048_mont_norm_36(norm, m); if (reduceA != 0) { - err = sp_2048_mod_45(t[1], a, m); + err = sp_2048_mod_36(t[1], a, m); if (err == MP_OKAY) { - sp_2048_mul_45(t[1], t[1], norm); - err = sp_2048_mod_45(t[1], t[1], m); + sp_2048_mul_36(t[1], t[1], norm); + err = sp_2048_mod_36(t[1], t[1], m); } } else { - sp_2048_mul_45(t[1], a, norm); - err = sp_2048_mod_45(t[1], t[1], m); + sp_2048_mul_36(t[1], a, norm); + err = sp_2048_mod_36(t[1], t[1], m); } } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -2045,28 +2281,28 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_2048_mont_mul_45(t[y^1], t[0], t[1], m, mp); + sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 45 * 2); - sp_2048_mont_sqr_45(t[2], t[2], m, mp); + sizeof(*t[2]) * 36 * 2); + sp_2048_mont_sqr_36(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 45 * 2); + sizeof(*t[2]) * 36 * 2); } - sp_2048_mont_reduce_45(t[0], m, mp); - n = sp_2048_cmp_45(t[0], m); - sp_2048_cond_sub_45(t[0], t[0], m, ((n < 0) ? + sp_2048_mont_reduce_36(t[0], m, mp); + n = sp_2048_cmp_36(t[0], m); + sp_2048_cond_sub_36(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 45 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 36 * 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -2079,7 +2315,7 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 90) + 90]; + sp_digit td[(32 * 72) + 72]; #endif sp_digit* t[32]; sp_digit* rt = NULL; @@ -2092,7 +2328,7 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 90) + 90), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 72) + 72), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2101,64 +2337,64 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<32; i++) - t[i] = td + i * 90; - rt = td + 2880; + t[i] = td + i * 72; + rt = td + 2304; sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_45(norm, m); + sp_2048_mont_norm_36(norm, m); if (reduceA != 0) { - err = sp_2048_mod_45(t[1], a, m); + err = sp_2048_mod_36(t[1], a, m); if (err == MP_OKAY) { - sp_2048_mul_45(t[1], t[1], norm); - err = sp_2048_mod_45(t[1], t[1], m); + sp_2048_mul_36(t[1], t[1], norm); + err = sp_2048_mod_36(t[1], t[1], m); } } else { - sp_2048_mul_45(t[1], a, norm); - err = sp_2048_mod_45(t[1], t[1], m); + sp_2048_mul_36(t[1], a, norm); + err = sp_2048_mod_36(t[1], t[1], m); } } if (err == MP_OKAY) { - sp_2048_mont_sqr_45(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_45(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_45(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_45(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_45(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_45(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_45(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_45(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_45(t[10], t[ 5], m, mp); - sp_2048_mont_mul_45(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_45(t[12], t[ 6], m, mp); - sp_2048_mont_mul_45(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_45(t[14], t[ 7], m, mp); - sp_2048_mont_mul_45(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_45(t[16], t[ 8], m, mp); - sp_2048_mont_mul_45(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_45(t[18], t[ 9], m, mp); - sp_2048_mont_mul_45(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_45(t[20], t[10], m, mp); - sp_2048_mont_mul_45(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_45(t[22], t[11], m, mp); - sp_2048_mont_mul_45(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_45(t[24], t[12], m, mp); - sp_2048_mont_mul_45(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_45(t[26], t[13], m, mp); - sp_2048_mont_mul_45(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_45(t[28], t[14], m, mp); - sp_2048_mont_mul_45(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_45(t[30], t[15], m, mp); - sp_2048_mont_mul_45(t[31], t[16], t[15], m, mp); + sp_2048_mont_sqr_36(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_36(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_36(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_36(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_36(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_36(t[ 7], t[ 4], t[ 3], m, mp); + sp_2048_mont_sqr_36(t[ 8], t[ 4], m, mp); + sp_2048_mont_mul_36(t[ 9], t[ 5], t[ 4], m, mp); + sp_2048_mont_sqr_36(t[10], t[ 5], m, mp); + sp_2048_mont_mul_36(t[11], t[ 6], t[ 5], m, mp); + 
sp_2048_mont_sqr_36(t[12], t[ 6], m, mp); + sp_2048_mont_mul_36(t[13], t[ 7], t[ 6], m, mp); + sp_2048_mont_sqr_36(t[14], t[ 7], m, mp); + sp_2048_mont_mul_36(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_36(t[16], t[ 8], m, mp); + sp_2048_mont_mul_36(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_36(t[18], t[ 9], m, mp); + sp_2048_mont_mul_36(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_36(t[20], t[10], m, mp); + sp_2048_mont_mul_36(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_36(t[22], t[11], m, mp); + sp_2048_mont_mul_36(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_36(t[24], t[12], m, mp); + sp_2048_mont_mul_36(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_36(t[26], t[13], m, mp); + sp_2048_mont_mul_36(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_36(t[28], t[14], m, mp); + sp_2048_mont_mul_36(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_36(t[30], t[15], m, mp); + sp_2048_mont_mul_36(t[31], t[16], t[15], m, mp); bits = ((bits + 4) / 5) * 5; - i = ((bits + 22) / 23) - 1; - c = bits % 23; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 23; + c = 29; } - if (i < 45) { + if (i < 36) { n = e[i--] << (32 - c); } else { @@ -2166,36 +2402,48 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, i--; } if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 27) & 0x1f); n <<= 5; c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 90); + XMEMCPY(rt, t[y], sizeof(sp_digit) * 72); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + if (c >= 5) { + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c = 24; + } + else { + y = (byte)((n >> 27) & 0x1f); + n = e[i--] << 3; + c = 5 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - sp_2048_mont_sqr_45(rt, rt, m, mp); - sp_2048_mont_sqr_45(rt, rt, m, mp); - sp_2048_mont_sqr_45(rt, rt, m, mp); - sp_2048_mont_sqr_45(rt, rt, m, mp); - sp_2048_mont_sqr_45(rt, rt, m, mp); + sp_2048_mont_sqr_36(rt, rt, m, mp); + sp_2048_mont_sqr_36(rt, rt, m, mp); + sp_2048_mont_sqr_36(rt, rt, m, mp); + sp_2048_mont_sqr_36(rt, rt, m, mp); + sp_2048_mont_sqr_36(rt, rt, m, mp); - sp_2048_mont_mul_45(rt, rt, t[y], m, mp); + sp_2048_mont_mul_36(rt, rt, t[y], m, mp); } - sp_2048_mont_reduce_45(rt, m, mp); - n = sp_2048_cmp_45(rt, m); - sp_2048_cond_sub_45(rt, rt, m, ((n < 0) ? + sp_2048_mont_reduce_36(rt, m, mp); + n = sp_2048_cmp_36(rt, m); + sp_2048_cond_sub_36(rt, rt, m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 90); + XMEMCPY(r, rt, sizeof(sp_digit) * 72); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -2215,34 +2463,40 @@ static int sp_2048_mod_exp_45(sp_digit* r, const sp_digit* a, const sp_digit* e, * r A single precision number. * m A single precision number. */ -static void sp_2048_mont_norm_90(sp_digit* r, const sp_digit* m) +static void sp_2048_mont_norm_72(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. 
*/ #ifdef WOLFSSL_SP_SMALL int i; - for (i=0; i<89; i++) { - r[i] = 0x7fffff; + for (i=0; i<70; i++) { + r[i] = 0x1fffffff; } #else int i; - for (i = 0; i < 88; i += 8) { - r[i + 0] = 0x7fffff; - r[i + 1] = 0x7fffff; - r[i + 2] = 0x7fffff; - r[i + 3] = 0x7fffff; - r[i + 4] = 0x7fffff; - r[i + 5] = 0x7fffff; - r[i + 6] = 0x7fffff; - r[i + 7] = 0x7fffff; + for (i = 0; i < 64; i += 8) { + r[i + 0] = 0x1fffffff; + r[i + 1] = 0x1fffffff; + r[i + 2] = 0x1fffffff; + r[i + 3] = 0x1fffffff; + r[i + 4] = 0x1fffffff; + r[i + 5] = 0x1fffffff; + r[i + 6] = 0x1fffffff; + r[i + 7] = 0x1fffffff; } - r[88] = 0x7fffff; -#endif - r[89] = 0x1L; + r[64] = 0x1fffffff; + r[65] = 0x1fffffff; + r[66] = 0x1fffffff; + r[67] = 0x1fffffff; + r[68] = 0x1fffffff; + r[69] = 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ + r[70] = 0x3ffffL; + r[71] = 0; /* r = (2^n - 1) mod n */ - (void)sp_2048_sub_90(r, r, m); + (void)sp_2048_sub_72(r, r, m); /* Add one so r = 2^n mod m */ r[0] += 1; @@ -2255,21 +2509,19 @@ static void sp_2048_mont_norm_90(sp_digit* r, const sp_digit* m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static sp_digit sp_2048_cmp_90(const sp_digit* a, const sp_digit* b) +static sp_digit sp_2048_cmp_72(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; #ifdef WOLFSSL_SP_SMALL int i; - for (i=89; i>=0; i--) { + for (i=71; i>=0; i--) { r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } #else int i; - r |= (a[89] - b[89]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[88] - b[88]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 80; i >= 0; i -= 8) { + for (i = 64; i >= 0; i -= 8) { r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -2292,19 +2544,19 @@ static sp_digit sp_2048_cmp_90(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -static void sp_2048_cond_sub_90(sp_digit* r, const sp_digit* a, +static void sp_2048_cond_sub_72(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { r[i] = a[i] - (b[i] & m); } #else int i; - for (i = 0; i < 88; i += 8) { + for (i = 0; i < 72; i += 8) { r[i + 0] = a[i + 0] - (b[i + 0] & m); r[i + 1] = a[i + 1] - (b[i + 1] & m); r[i + 2] = a[i + 2] - (b[i + 2] & m); @@ -2314,8 +2566,6 @@ static void sp_2048_cond_sub_90(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] - (b[i + 6] & m); r[i + 7] = a[i + 7] - (b[i + 7] & m); } - r[88] = a[88] - (b[88] & m); - r[89] = a[89] - (b[89] & m); #endif /* WOLFSSL_SP_SMALL */ } @@ -2325,76 +2575,112 @@ static void sp_2048_cond_sub_90(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
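sp_2048_mont_norm_72 above produces 2^n mod m (the value used to convert into Montgomery form) without ever materializing 2^n: it fills r with ones up to bit n - 1, giving 2^n - 1, subtracts m, and adds one. That works because every word of 2^n - 1 is maximal (so the word-wise subtract never borrows) and because the modulus has its top bit set, so 2^n - m is already fully reduced. A toy two-word, 8-bit-radix sketch; mont_norm, B, N and the sample modulus are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

#define B    8
#define N    2
#define MASK ((1u << B) - 1)

/* r = 2^(B*N) mod m, assuming m is odd and its top bit is set
 * (true for the 2048-bit RSA/DH moduli these routines serve). */
static void mont_norm(uint32_t r[N], const uint32_t m[N])
{
    int i;
    /* r = 2^(B*N) - 1: every word all ones, so subtracting m never borrows. */
    for (i = 0; i < N; i++) {
        r[i] = MASK - m[i];
    }
    r[0] += 1;                    /* r = (2^(B*N) - 1) - m + 1 = 2^(B*N) - m */
    for (i = 0; i < N - 1; i++) { /* carry propagation, as in the hunks */
        r[i + 1] += r[i] >> B;
        r[i] &= MASK;
    }
}

int main(void)
{
    uint32_t m[N] = { 0xEF, 0xBE };   /* m = 0xBEEF: odd, top bit set */
    uint32_t r[N];

    mont_norm(r, m);
    /* Compare against a direct computation of 2^16 mod m. */
    printf("0x%02X%02X vs 0x%X\n", (unsigned)r[1], (unsigned)r[0],
           (unsigned)((1u << (B * N)) % 0xBEEFu));
    return 0;
}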
*/ -SP_NOINLINE static void sp_2048_mul_add_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_add_72(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 90; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x7fffff; - t >>= 23; + for (i = 0; i < 72; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; } - r[90] += (sp_digit)t; + r[72] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[8]; +#ifdef WOLFSSL_SP_SMALL + sp_int64 tb = b; + sp_int64 t[4]; int i; - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x7fffff); - for (i = 0; i < 88; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 23) + (t[5] & 0x7fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 23) + (t[6] & 0x7fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 23) + (t[7] & 0x7fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 23) + (t[0] & 0x7fffff)); + t[0] = 0; + for (i = 0; i < 68; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; } - t[1] = tb * a[89]; - r[89] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - r[90] += (sp_digit)(t[1] >> 23); + t[0] += (tb * a[68]) + r[68]; + t[1] = (tb * a[69]) + r[69]; + t[2] = (tb * a[70]) + r[70]; + t[3] = (tb * a[71]) + r[71]; + r[68] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[69] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[70] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[71] = t[3] & 0x1fffffff; + r[72] += (sp_digit)(t[3] >> 29); +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 64; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[4] += t[3] >> 29; + r[i+4] = t[4] & 0x1fffffff; + t[5] += t[4] >> 29; + r[i+5] = t[5] & 0x1fffffff; + t[6] += t[5] >> 29; + r[i+6] = t[6] & 0x1fffffff; + t[7] += t[6] >> 29; + r[i+7] = t[7] & 0x1fffffff; + t[0] = t[7] >> 29; + } + t[0] += (tb * a[64]) + r[64]; + t[1] = (tb * a[65]) + r[65]; + t[2] = (tb * a[66]) + r[66]; + t[3] = (tb * a[67]) + r[67]; + t[4] = (tb * a[68]) + r[68]; + t[5] = (tb * a[69]) + r[69]; + t[6] = (tb * a[70]) + r[70]; + t[7] = (tb * a[71]) + r[71]; + r[64] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[65] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[66] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[67] = t[3] & 0x1fffffff; + t[4] += t[3] >> 29; + r[68] = t[4] & 0x1fffffff; + t[5] += t[4] >> 29; + r[69] = t[5] & 0x1fffffff; + 
t[6] += t[5] >> 29; + r[70] = t[6] & 0x1fffffff; + t[7] += t[6] >> 29; + r[71] = t[7] & 0x1fffffff; + r[72] += (sp_digit)(t[7] >> 29); #endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 23. - * - * a Array of sp_digit to normalize. - */ -static void sp_2048_norm_90(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 89; i++) { - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - } -#else - int i; - for (i = 0; i < 88; i += 8) { - a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff; - a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff; - a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff; - a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff; - a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff; - a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff; - a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff; - a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff; - } - a[88+1] += a[88] >> 23; a[88] &= 0x7fffff; -#endif +#endif /* !WOLFSSL_SP_LARGE_CODE */ } /* Shift the result in the high 2048 bits down to the bottom. @@ -2402,45 +2688,50 @@ static void sp_2048_norm_90(sp_digit* a) * r A single precision number. * a A single precision number. */ -static void sp_2048_mont_shift_90(sp_digit* r, const sp_digit* a) +static void sp_2048_mont_shift_72(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - int64_t n = a[89] >> 1; - n += ((int64_t)a[90]) << 22; + sp_int64 n = a[70] >> 18; + n += ((sp_int64)a[71]) << 11; - for (i = 0; i < 89; i++) { - r[i] = n & 0x7fffff; - n >>= 23; - n += ((int64_t)a[91 + i]) << 22; + for (i = 0; i < 70; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[72 + i]) << 11; } - r[89] = (sp_digit)n; + r[70] = (sp_digit)n; #else int i; - int64_t n = a[89] >> 1; - n += ((int64_t)a[90]) << 22; - for (i = 0; i < 88; i += 8) { - r[i + 0] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 91]) << 22; - r[i + 1] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 92]) << 22; - r[i + 2] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 93]) << 22; - r[i + 3] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 94]) << 22; - r[i + 4] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 95]) << 22; - r[i + 5] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 96]) << 22; - r[i + 6] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 97]) << 22; - r[i + 7] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 98]) << 22; + sp_int64 n = a[70] >> 18; + n += ((sp_int64)a[71]) << 11; + for (i = 0; i < 64; i += 8) { + r[i + 0] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 72]) << 11; + r[i + 1] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 73]) << 11; + r[i + 2] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 74]) << 11; + r[i + 3] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 75]) << 11; + r[i + 4] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 76]) << 11; + r[i + 5] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 77]) << 11; + r[i + 6] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 78]) << 11; + r[i + 7] = n & 0x1fffffff; + n >>= 29; n += ((sp_int64)a[i + 79]) << 11; } - r[88] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[179]) << 22; - r[89] = (sp_digit)n; + r[64] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[136]) << 11; + r[65] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[137]) << 11; + r[66] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[138]) << 11; + r[67] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[139]) << 11; + r[68] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[140]) << 11; + r[69] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[141]) << 11; + r[70] = (sp_digit)n; #endif /* 
WOLFSSL_SP_SMALL */ - XMEMSET(&r[90], 0, sizeof(*r) * 90U); + XMEMSET(&r[71], 0, sizeof(*r) * 71U); } /* Reduce the number back to 2048 bits using Montgomery reduction. @@ -2449,51 +2740,51 @@ static void sp_2048_mont_shift_90(sp_digit* r, const sp_digit* a) * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -static void sp_2048_mont_reduce_90(sp_digit* a, const sp_digit* m, sp_digit mp) +static void sp_2048_mont_reduce_72(sp_digit* a, const sp_digit* m, sp_digit mp) { int i; sp_digit mu; - sp_2048_norm_90(a + 90); + sp_2048_norm_72(a + 71); #ifdef WOLFSSL_SP_DH if (mp != 1) { - for (i=0; i<89; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; + for (i=0; i<70; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x1L; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; + mu = (a[i] * mp) & 0x3ffffL; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } else { - for (i=0; i<89; i++) { - mu = a[i] & 0x7fffff; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; + for (i=0; i<70; i++) { + mu = a[i] & 0x1fffffff; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = a[i] & 0x1L; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; + mu = a[i] & 0x3ffffL; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } #else - for (i=0; i<89; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; + for (i=0; i<70; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x1L; - sp_2048_mul_add_90(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; + mu = (a[i] * mp) & 0x3ffffL; + sp_2048_mul_add_72(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; #endif - sp_2048_mont_shift_90(a, a); - sp_2048_cond_sub_90(a, a, m, 0 - (((a[89] >> 1) > 0) ? + sp_2048_mont_shift_72(a, a); + sp_2048_cond_sub_72(a, a, m, 0 - (((a[70] - m[70]) > 0) ? (sp_digit)1 : (sp_digit)0)); - sp_2048_norm_90(a); + sp_2048_norm_72(a); } /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -2505,11 +2796,11 @@ static void sp_2048_mont_reduce_90(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_2048_mont_mul_90(sp_digit* r, const sp_digit* a, +static void sp_2048_mont_mul_72(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_2048_mul_90(r, a, b); - sp_2048_mont_reduce_90(r, m, mp); + sp_2048_mul_72(r, a, b); + sp_2048_mont_reduce_72(r, m, mp); } /* Square the Montgomery form number. (r = a * a mod m) @@ -2519,11 +2810,44 @@ static void sp_2048_mont_mul_90(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_2048_mont_sqr_90(sp_digit* r, const sp_digit* a, +static void sp_2048_mont_sqr_72(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_2048_sqr_90(r, a); - sp_2048_mont_reduce_90(r, m, mp); + sp_2048_sqr_72(r, a); + sp_2048_mont_reduce_72(r, m, mp); +} + +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. 
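The sp_2048_norm_* routines, such as the sp_2048_norm_71 added below, exist because intermediate words are allowed to grow past 29 bits during multiply-add passes; normalization pushes each word's excess into the next word without changing the represented value. A small sketch at an 8-bit radix (normalize, B, WORDS and the sample values are illustrative only):

#include <stdint.h>
#include <stdio.h>

#define B     8
#define MASK  ((int32_t)((1 << B) - 1))
#define WORDS 4

/* Push each word's bits above the radix into the next word. */
static void normalize(int32_t a[WORDS])
{
    int i;
    for (i = 0; i < WORDS - 1; i++) {
        a[i + 1] += a[i] >> B;    /* carry the excess upward       */
        a[i] &= MASK;             /* keep only B bits in this word */
    }
}

int main(void)
{
    int32_t a[WORDS] = { 0x1F3, 0x2FF, 0x01, 0x00 };  /* two words over 8 bits */
    int32_t before, after;

    before = a[0] + (a[1] << B) + (a[2] << (2 * B)) + (a[3] << (3 * B));
    normalize(a);
    after = a[0] + (a[1] << B) + (a[2] << (2 * B)) + (a[3] << (3 * B));

    /* Same value, but every word is back under 2^B. */
    printf("%d == %d : 0x%X 0x%X 0x%X 0x%X\n", (int)before, (int)after,
           (int)a[0], (int)a[1], (int)a[2], (int)a[3]);
    return 0;
}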
+ */ +static void sp_2048_norm_71(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 70; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + int i; + for (i = 0; i < 64; i += 8) { + a[i+1] += a[i+0] >> 29; a[i+0] &= 0x1fffffff; + a[i+2] += a[i+1] >> 29; a[i+1] &= 0x1fffffff; + a[i+3] += a[i+2] >> 29; a[i+2] &= 0x1fffffff; + a[i+4] += a[i+3] >> 29; a[i+3] &= 0x1fffffff; + a[i+5] += a[i+4] >> 29; a[i+4] &= 0x1fffffff; + a[i+6] += a[i+5] >> 29; a[i+5] &= 0x1fffffff; + a[i+7] += a[i+6] >> 29; a[i+6] &= 0x1fffffff; + a[i+8] += a[i+7] >> 29; a[i+7] &= 0x1fffffff; + } + a[65] += a[64] >> 29; a[64] &= 0x1fffffff; + a[66] += a[65] >> 29; a[65] &= 0x1fffffff; + a[67] += a[66] >> 29; a[66] &= 0x1fffffff; + a[68] += a[67] >> 29; a[67] &= 0x1fffffff; + a[69] += a[68] >> 29; a[68] &= 0x1fffffff; + a[70] += a[69] >> 29; a[69] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ } /* Multiply a by scalar b into r. (r = a * b) @@ -2532,50 +2856,50 @@ static void sp_2048_mont_sqr_90(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. */ -SP_NOINLINE static void sp_2048_mul_d_180(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_mul_d_144(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 180; i++) { + for (i = 0; i < 144; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[180] = (sp_digit)t; + r[144] = (sp_digit)t; #else - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; sp_digit t2; - int64_t p[4]; + sp_int64 p[4]; int i; - for (i = 0; i < 180; i += 4) { + for (i = 0; i < 144; i += 4) { p[0] = tb * a[i + 0]; p[1] = tb * a[i + 1]; p[2] = tb * a[i + 2]; p[3] = tb * a[i + 3]; t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 0] = (sp_digit)t2; t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 1] = (sp_digit)t2; t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 2] = (sp_digit)t2; t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; + t2 = (sp_digit)(t & 0x1fffffff); + t >>= 29; r[i + 3] = (sp_digit)t2; } - r[180] = (sp_digit)(t & 0x7fffff); + r[144] = (sp_digit)(t & 0x1fffffff); #endif /* WOLFSSL_SP_SMALL */ } @@ -2587,19 +2911,19 @@ SP_NOINLINE static void sp_2048_mul_d_180(sp_digit* r, const sp_digit* a, * b A single precision number to add. * m Mask value to apply. 
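sp_2048_mul_d_72 / sp_2048_mul_d_144 above multiply a multi-word value by one digit, accumulating each product in a 64-bit temporary and carrying its high part into the next word; sp_2048_div_72 uses them to pre-scale the divisor and numerator. A short sketch, again at an 8-bit radix with made-up names (mul_d, B, WORDS):

#include <stdint.h>
#include <stdio.h>

#define B     8
#define MASK  ((1u << B) - 1)
#define WORDS 3

/* r = a * b where a has WORDS digits and b is a single digit;
 * r needs one extra word for the final carry. */
static void mul_d(uint32_t r[WORDS + 1], const uint32_t a[WORDS], uint32_t b)
{
    uint64_t t = 0;
    int i;
    for (i = 0; i < WORDS; i++) {
        t += (uint64_t)b * a[i];      /* product plus carry from the last word */
        r[i] = (uint32_t)(t & MASK);
        t >>= B;
    }
    r[WORDS] = (uint32_t)t;
}

int main(void)
{
    uint32_t a[WORDS] = { 0x34, 0x12, 0x00 };   /* 0x001234 */
    uint32_t r[WORDS + 1];

    mul_d(r, a, 0xAB);
    /* 0x1234 * 0xAB = 0xC28BC */
    printf("0x%02X%02X%02X%02X\n", (unsigned)r[3], (unsigned)r[2],
           (unsigned)r[1], (unsigned)r[0]);
    return 0;
}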
*/ -static void sp_2048_cond_add_90(sp_digit* r, const sp_digit* a, +static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 90; i++) { + for (i = 0; i < 71; i++) { r[i] = a[i] + (b[i] & m); } #else int i; - for (i = 0; i < 88; i += 8) { + for (i = 0; i < 64; i += 8) { r[i + 0] = a[i + 0] + (b[i + 0] & m); r[i + 1] = a[i + 1] + (b[i + 1] & m); r[i + 2] = a[i + 2] + (b[i + 2] & m); @@ -2609,65 +2933,172 @@ static void sp_2048_cond_add_90(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + (b[i + 6] & m); r[i + 7] = a[i + 7] + (b[i + 7] & m); } - r[88] = a[88] + (b[88] & m); - r[89] = a[89] + (b[89] & m); + r[64] = a[64] + (b[64] & m); + r[65] = a[65] + (b[65] & m); + r[66] = a[66] + (b[66] & m); + r[67] = a[67] + (b[67] & m); + r[68] = a[68] + (b[68] & m); + r[69] = a[69] + (b[69] & m); + r[70] = a[70] + (b[70] & m); #endif /* WOLFSSL_SP_SMALL */ } -SP_NOINLINE static void sp_2048_rshift_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_rshift_72(sp_digit* r, const sp_digit* a, byte n) { int i; #ifdef WOLFSSL_SP_SMALL - for (i=0; i<89; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (23 - n))) & 0x7fffff; + for (i=0; i<71; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } #else - for (i=0; i<88; i += 8) { - r[i+0] = (a[i+0] >> n) | ((a[i+1] << (23 - n)) & 0x7fffff); - r[i+1] = (a[i+1] >> n) | ((a[i+2] << (23 - n)) & 0x7fffff); - r[i+2] = (a[i+2] >> n) | ((a[i+3] << (23 - n)) & 0x7fffff); - r[i+3] = (a[i+3] >> n) | ((a[i+4] << (23 - n)) & 0x7fffff); - r[i+4] = (a[i+4] >> n) | ((a[i+5] << (23 - n)) & 0x7fffff); - r[i+5] = (a[i+5] >> n) | ((a[i+6] << (23 - n)) & 0x7fffff); - r[i+6] = (a[i+6] >> n) | ((a[i+7] << (23 - n)) & 0x7fffff); - r[i+7] = (a[i+7] >> n) | ((a[i+8] << (23 - n)) & 0x7fffff); + for (i=0; i<64; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (29 - n)) & 0x1fffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (29 - n)) & 0x1fffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (29 - n)) & 0x1fffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (29 - n)) & 0x1fffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (29 - n)) & 0x1fffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (29 - n)) & 0x1fffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (29 - n)) & 0x1fffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (29 - n)) & 0x1fffffff); } - r[88] = (a[88] >> n) | ((a[89] << (23 - n)) & 0x7fffff); -#endif - r[89] = a[89] >> n; + r[64] = (a[64] >> n) | ((a[65] << (29 - n)) & 0x1fffffff); + r[65] = (a[65] >> n) | ((a[66] << (29 - n)) & 0x1fffffff); + r[66] = (a[66] >> n) | ((a[67] << (29 - n)) & 0x1fffffff); + r[67] = (a[67] >> n) | ((a[68] << (29 - n)) & 0x1fffffff); + r[68] = (a[68] >> n) | ((a[69] << (29 - n)) & 0x1fffffff); + r[69] = (a[69] >> n) | ((a[70] << (29 - n)) & 0x1fffffff); + r[70] = (a[70] >> n) | ((a[71] << (29 - n)) & 0x1fffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[71] = a[71] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_2048_div_word_90(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_2048_div_word_72(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; sp_digit r; sp_digit t; - /* All 23 bits from d1 and top 8 bits from d0. */ - d = (d1 << 8) + (d0 >> 15); + /* All 29 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 27); r = d / dv; d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; /* Up to 9 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 7) & ((1 << 8) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); t = d / dv; d -= t * dv; r += t; /* Up to 17 bits in r */ - /* Remaining 7 bits from d0. */ - r <<= 7; - d <<= 7; - d += d0 & ((1 << 7) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); t = d / dv; r += t; - /* All 23 bits from d1 and top 8 bits from d0. */ + /* All 29 bits from d1 and top 2 bits from d0. */ return r; } #endif /* WOLFSSL_SP_DIV_32 */ @@ -2683,19 +3114,19 @@ static WC_INLINE sp_digit sp_2048_div_word_90(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
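A note before the sp_2048_div_72 body below: the routine first scales both operands by a power of two (the mul_d calls with 1 << 11) so the divisor's top word is large and the per-digit quotient estimate stays close, and sp_2048_rshift_72 removes that factor from the remainder at the end. The cross-word right shift itself looks like the sketch here, at an 8-bit radix and with invented names (rshift, B, WORDS); it assumes 0 < n < B, as the callers guarantee:

#include <stdint.h>
#include <stdio.h>

#define B     8
#define MASK  ((1u << B) - 1)
#define WORDS 4

/* Shift a WORDS-word value right by n bits (0 < n < B): each result word
 * takes its high bits from the low bits of the next word. */
static void rshift(uint32_t r[WORDS], const uint32_t a[WORDS], unsigned n)
{
    int i;
    for (i = 0; i < WORDS - 1; i++) {
        r[i] = ((a[i] >> n) | (a[i + 1] << (B - n))) & MASK;
    }
    r[WORDS - 1] = a[WORDS - 1] >> n;   /* nothing above the top word */
}

int main(void)
{
    uint32_t a[WORDS] = { 0x78, 0x56, 0x34, 0x12 };   /* 0x12345678 */
    uint32_t r[WORDS];

    rshift(r, a, 4);
    /* 0x12345678 >> 4 = 0x01234567 */
    printf("0x%02X%02X%02X%02X\n", (unsigned)r[3], (unsigned)r[2],
           (unsigned)r[1], (unsigned)r[0]);
    return 0;
}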
*/ -static int sp_2048_div_90(const sp_digit* a, const sp_digit* d, +static int sp_2048_div_72(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 90 + 3]; + sp_digit t1[4 * 72 + 3]; #endif sp_digit* t2 = NULL; sp_digit* sd = NULL; @@ -2704,7 +3135,7 @@ static int sp_2048_div_90(const sp_digit* a, const sp_digit* d, (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 90 + 3), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 72 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; @@ -2713,53 +3144,60 @@ static int sp_2048_div_90(const sp_digit* a, const sp_digit* d, (void)m; if (err == MP_OKAY) { - t2 = t1 + 180 + 1; - sd = t2 + 90 + 1; + t2 = t1 + 144 + 1; + sd = t2 + 72 + 1; - sp_2048_mul_d_90(sd, d, (sp_digit)1 << 22); - sp_2048_mul_d_180(t1, a, (sp_digit)1 << 22); - dv = sd[89]; - t1[90 + 90] += t1[90 + 90 - 1] >> 23; - t1[90 + 90 - 1] &= 0x7fffff; - for (i=90; i>=0; i--) { + sp_2048_mul_d_72(sd, d, (sp_digit)1 << 11); + sp_2048_mul_d_144(t1, a, (sp_digit)1 << 11); + dv = sd[70]; + t1[71 + 71] += t1[71 + 71 - 1] >> 29; + t1[71 + 71 - 1] &= 0x1fffffff; + for (i=71; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[90 + i]; - d1 <<= 23; - d1 += t1[90 + i - 1]; + d1 = t1[71 + i]; + d1 <<= 29; + d1 += t1[71 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_2048_div_word_90(t1[90 + i], t1[90 + i - 1], dv); + r1 = sp_2048_div_word_72(t1[71 + i], t1[71 + i - 1], dv); #endif - sp_2048_mul_d_90(t2, sd, r1); - (void)sp_2048_sub_90(&t1[i], &t1[i], t2); - sp_2048_norm_90(&t1[i]); - t1[90 + i] -= t2[90]; - t1[90 + i] += t1[90 + i - 1] >> 23; - t1[90 + i - 1] &= 0x7fffff; - r1 = (((-t1[90 + i]) << 23) - t1[90 + i - 1]) / dv; - r1 -= t1[90 + i]; - sp_2048_mul_d_90(t2, sd, r1); - (void)sp_2048_add_90(&t1[i], &t1[i], t2); - t1[90 + i] += t1[90 + i - 1] >> 23; - t1[90 + i - 1] &= 0x7fffff; + sp_2048_mul_d_72(t2, sd, r1); + (void)sp_2048_sub_72(&t1[i], &t1[i], t2); + sp_2048_norm_71(&t1[i]); + t1[71 + i] += t1[71 + i - 1] >> 29; + t1[71 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[71 + i]; + d1 <<= 29; + d1 -= t1[71 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_72(-t1[71 + i], -t1[71 + i - 1], dv); +#endif + r1 -= t1[71 + i]; + sp_2048_mul_d_72(t2, sd, r1); + (void)sp_2048_add_72(&t1[i], &t1[i], t2); + t1[71 + i] += t1[71 + i - 1] >> 29; + t1[71 + i - 1] &= 0x1fffffff; } - t1[90 - 1] += t1[90 - 2] >> 23; - t1[90 - 2] &= 0x7fffff; - r1 = t1[90 - 1] / dv; + t1[71 - 1] += t1[71 - 2] >> 29; + t1[71 - 2] &= 0x1fffffff; + r1 = t1[71 - 1] / dv; - sp_2048_mul_d_90(t2, sd, r1); - sp_2048_sub_90(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 180U); - for (i=0; i<89; i++) { - r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; + sp_2048_mul_d_72(t2, sd, r1); + sp_2048_sub_72(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 144U); + for (i=0; i<70; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } - sp_2048_cond_add_90(r, r, sd, 0 - ((r[89] < 0) ? + sp_2048_cond_add_72(r, r, sd, 0 - ((r[70] < 0) ? 
(sp_digit)1 : (sp_digit)0)); - sp_2048_norm_90(r); - sp_2048_rshift_90(r, r, 22); + sp_2048_norm_71(r); + sp_2048_rshift_72(r, r, 11); + r[71] = 0; } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -2777,9 +3215,9 @@ static int sp_2048_div_90(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_2048_mod_90(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_2048_mod_72(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_2048_div_90(a, m, NULL, r); + return sp_2048_div_72(a, m, NULL, r); } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ @@ -2793,14 +3231,14 @@ static int sp_2048_mod_90(sp_digit* r, const sp_digit* a, const sp_digit* m) * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. */ -static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, +static int sp_2048_mod_exp_72(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { #if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 180]; + sp_digit td[3 * 144]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -2812,7 +3250,7 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 90 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 72 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2821,29 +3259,29 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 90 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 90U * 2U); + t[i] = td + (i * 72 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 72U * 2U); } sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); if (reduceA != 0) { - err = sp_2048_mod_90(t[1], a, m); + err = sp_2048_mod_72(t[1], a, m); } else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 90U); + XMEMCPY(t[1], a, sizeof(sp_digit) * 72U); } } if (err == MP_OKAY) { - sp_2048_mul_90(t[1], t[1], norm); - err = sp_2048_mod_90(t[1], t[1], m); + sp_2048_mul_72(t[1], t[1], norm); + err = sp_2048_mod_72(t[1], t[1], m); } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -2851,28 +3289,28 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_2048_mont_mul_90(t[y^1], t[0], t[1], m, mp); + sp_2048_mont_mul_72(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 90 * 2); - sp_2048_mont_sqr_90(t[2], t[2], m, mp); + sizeof(*t[2]) * 72 * 2); + sp_2048_mont_sqr_72(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 90 * 2); + sizeof(*t[2]) * 72 * 2); } - sp_2048_mont_reduce_90(t[0], m, mp); - n = sp_2048_cmp_90(t[0], m); - 
sp_2048_cond_sub_90(t[0], t[0], m, ((n < 0) ? + sp_2048_mont_reduce_72(t[0], m, mp); + n = sp_2048_cmp_72(t[0], m); + sp_2048_cond_sub_72(t[0], t[0], m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 90 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 72 * 2); } @@ -2886,7 +3324,7 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 180]; + sp_digit td[3 * 144]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -2898,7 +3336,7 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 90 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 72 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2907,29 +3345,29 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 90 * 2); + t[i] = td + (i * 72 * 2); } sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); if (reduceA != 0) { - err = sp_2048_mod_90(t[1], a, m); + err = sp_2048_mod_72(t[1], a, m); if (err == MP_OKAY) { - sp_2048_mul_90(t[1], t[1], norm); - err = sp_2048_mod_90(t[1], t[1], m); + sp_2048_mul_72(t[1], t[1], norm); + err = sp_2048_mod_72(t[1], t[1], m); } } else { - sp_2048_mul_90(t[1], a, norm); - err = sp_2048_mod_90(t[1], t[1], m); + sp_2048_mul_72(t[1], a, norm); + err = sp_2048_mod_72(t[1], t[1], m); } } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -2937,28 +3375,28 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_2048_mont_mul_90(t[y^1], t[0], t[1], m, mp); + sp_2048_mont_mul_72(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 90 * 2); - sp_2048_mont_sqr_90(t[2], t[2], m, mp); + sizeof(*t[2]) * 72 * 2); + sp_2048_mont_sqr_72(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 90 * 2); + sizeof(*t[2]) * 72 * 2); } - sp_2048_mont_reduce_90(t[0], m, mp); - n = sp_2048_cmp_90(t[0], m); - sp_2048_cond_sub_90(t[0], t[0], m, ((n < 0) ? + sp_2048_mont_reduce_72(t[0], m, mp); + n = sp_2048_cmp_72(t[0], m); + sp_2048_cond_sub_72(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 90 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 72 * 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -2971,9 +3409,9 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 180) + 180]; + sp_digit td[(16 * 144) + 144]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm = NULL; sp_digit mp = 1; @@ -2984,7 +3422,7 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 180) + 180), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 144) + 144), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2992,102 +3430,97 @@ static int sp_2048_mod_exp_90(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<32; i++) - t[i] = td + i * 180; - rt = td + 5760; + for (i=0; i<16; i++) + t[i] = td + i * 144; + rt = td + 2304; sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); if (reduceA != 0) { - err = sp_2048_mod_90(t[1], a, m); + err = sp_2048_mod_72(t[1], a, m); if (err == MP_OKAY) { - sp_2048_mul_90(t[1], t[1], norm); - err = sp_2048_mod_90(t[1], t[1], m); + sp_2048_mul_72(t[1], t[1], norm); + err = sp_2048_mod_72(t[1], t[1], m); } } else { - sp_2048_mul_90(t[1], a, norm); - err = sp_2048_mod_90(t[1], t[1], m); + sp_2048_mul_72(t[1], a, norm); + err = sp_2048_mod_72(t[1], t[1], m); } } if (err == MP_OKAY) { - sp_2048_mont_sqr_90(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_90(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_90(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_90(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_90(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_90(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_90(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_90(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_90(t[10], t[ 5], m, mp); - sp_2048_mont_mul_90(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_90(t[12], t[ 6], m, mp); - sp_2048_mont_mul_90(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_90(t[14], t[ 7], m, mp); - sp_2048_mont_mul_90(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_90(t[16], t[ 8], m, mp); - sp_2048_mont_mul_90(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_90(t[18], t[ 9], m, mp); - sp_2048_mont_mul_90(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_90(t[20], t[10], m, mp); - sp_2048_mont_mul_90(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_90(t[22], t[11], m, mp); - sp_2048_mont_mul_90(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_90(t[24], t[12], m, mp); - sp_2048_mont_mul_90(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_90(t[26], t[13], m, mp); - sp_2048_mont_mul_90(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_90(t[28], t[14], m, mp); - sp_2048_mont_mul_90(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_90(t[30], t[15], m, mp); - sp_2048_mont_mul_90(t[31], t[16], t[15], m, mp); + sp_2048_mont_sqr_72(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_72(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_72(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_72(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_72(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_72(t[ 7], t[ 4], t[ 3], m, mp); + sp_2048_mont_sqr_72(t[ 8], t[ 4], m, mp); + sp_2048_mont_mul_72(t[ 9], t[ 5], t[ 4], m, mp); + 
sp_2048_mont_sqr_72(t[10], t[ 5], m, mp); + sp_2048_mont_mul_72(t[11], t[ 6], t[ 5], m, mp); + sp_2048_mont_sqr_72(t[12], t[ 6], m, mp); + sp_2048_mont_mul_72(t[13], t[ 7], t[ 6], m, mp); + sp_2048_mont_sqr_72(t[14], t[ 7], m, mp); + sp_2048_mont_mul_72(t[15], t[ 8], t[ 7], m, mp); - bits = ((bits + 4) / 5) * 5; - i = ((bits + 22) / 23) - 1; - c = bits % 23; + bits = ((bits + 3) / 4) * 4; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 23; + c = 29; } - if (i < 90) { + if (i < 72) { n = e[i--] << (32 - c); } else { n = 0; i--; } - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + if (c < 4) { + n |= e[i--] << (3 - c); + c += 29; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 180); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 144); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - sp_2048_mont_sqr_90(rt, rt, m, mp); - sp_2048_mont_sqr_90(rt, rt, m, mp); - sp_2048_mont_sqr_90(rt, rt, m, mp); - sp_2048_mont_sqr_90(rt, rt, m, mp); - sp_2048_mont_sqr_90(rt, rt, m, mp); + sp_2048_mont_sqr_72(rt, rt, m, mp); + sp_2048_mont_sqr_72(rt, rt, m, mp); + sp_2048_mont_sqr_72(rt, rt, m, mp); + sp_2048_mont_sqr_72(rt, rt, m, mp); - sp_2048_mont_mul_90(rt, rt, t[y], m, mp); + sp_2048_mont_mul_72(rt, rt, t[y], m, mp); } - sp_2048_mont_reduce_90(rt, m, mp); - n = sp_2048_cmp_90(rt, m); - sp_2048_cond_sub_90(rt, rt, m, ((n < 0) ? + sp_2048_mont_reduce_72(rt, m, mp); + n = sp_2048_cmp_72(rt, m); + sp_2048_cond_sub_72(rt, rt, m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 180); + XMEMCPY(r, rt, sizeof(sp_digit) * 144); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -3121,7 +3554,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[90 * 5]; + sp_digit a[72 * 5]; #endif sp_digit* m = NULL; sp_digit* r = NULL; @@ -3136,7 +3569,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - if (mp_count_bits(em) > 23) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 256U) { @@ -3152,7 +3585,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 5, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 5, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -3160,12 +3593,12 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { - r = a + 90 * 2; - m = r + 90 * 2; + r = a + 72 * 2; + m = r + 72 * 2; norm = r; - sp_2048_from_bin(a, 90, in, inLen); -#if DIGIT_BIT >= 23 + sp_2048_from_bin(a, 72, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -3179,36 +3612,36 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_from_mp(m, 90, mm); + sp_2048_from_mp(m, 72, mm); sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); } if (err == MP_OKAY) { - sp_2048_mul_90(a, a, norm); - err = sp_2048_mod_90(a, a, m); + sp_2048_mul_72(a, a, norm); + err = sp_2048_mod_72(a, a, m); } if (err == MP_OKAY) { - for (i=22; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 90 * 2); + XMEMCPY(r, a, sizeof(sp_digit) * 72 * 2); for (i--; i>=0; i--) { - sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_2048_mont_mul_90(r, r, a, m, mp); + sp_2048_mont_mul_72(r, r, a, m, mp); } } - sp_2048_mont_reduce_90(r, m, mp); - mp = sp_2048_cmp_90(r, m); - sp_2048_cond_sub_90(r, r, m, ((mp < 0) ? + sp_2048_mont_reduce_72(r, m, mp); + mp = sp_2048_cmp_72(r, m); + sp_2048_cond_sub_72(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0)- 1); - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3222,7 +3655,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[90 * 5]; + sp_digit d[72 * 5]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -3234,7 +3667,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, err = MP_TO_E; } if (err == MP_OKAY) { - if (mp_count_bits(em) > 23) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 256U) { @@ -3250,7 +3683,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 5, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 5, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -3259,11 +3692,11 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { a = d; - r = a + 90 * 2; - m = r + 90 * 2; + r = a + 72 * 2; + m = r + 72 * 2; - sp_2048_from_bin(a, 90, in, inLen); -#if DIGIT_BIT >= 23 + sp_2048_from_bin(a, 72, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -3276,14 +3709,14 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } } if (err == MP_OKAY) { - sp_2048_from_mp(m, 90, mm); + sp_2048_from_mp(m, 72, mm); if (e[0] == 0x3) { - sp_2048_sqr_90(r, a); - err = sp_2048_mod_90(r, r, m); + sp_2048_sqr_72(r, a); + err = sp_2048_mod_72(r, r, m); if (err == MP_OKAY) { - sp_2048_mul_90(r, a, r); - err = sp_2048_mod_90(r, r, m); + sp_2048_mul_72(r, a, r); + err = sp_2048_mod_72(r, r, m); } } else { @@ -3292,36 +3725,36 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_digit mp; sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); - sp_2048_mul_90(a, a, norm); - err = sp_2048_mod_90(a, a, m); + sp_2048_mul_72(a, a, norm); + err = sp_2048_mod_72(a, a, m); if (err == MP_OKAY) { - for (i=22; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 180U); + XMEMCPY(r, a, sizeof(sp_digit) * 144U); for (i--; i>=0; i--) { - sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_2048_mont_mul_90(r, r, a, m, mp); + sp_2048_mont_mul_72(r, r, a, m, mp); } } - sp_2048_mont_reduce_90(r, m, mp); - mp = sp_2048_cmp_90(r, m); - sp_2048_cond_sub_90(r, r, m, ((mp < 0) ? + sp_2048_mont_reduce_72(r, m, mp); + mp = sp_2048_cmp_72(r, m); + sp_2048_cond_sub_72(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); } } } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3363,7 +3796,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[90 * 4]; + sp_digit d[72 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -3396,7 +3829,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -3404,18 +3837,18 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 90; - m = a + 180; + a = d + 72; + m = a + 144; r = a; - sp_2048_from_bin(a, 90, in, inLen); - sp_2048_from_mp(d, 90, dm); - sp_2048_from_mp(m, 90, mm); - err = sp_2048_mod_exp_90(r, a, d, 2048, m, 0); + sp_2048_from_bin(a, 72, in, inLen); + sp_2048_from_mp(d, 72, dm); + sp_2048_from_mp(m, 72, mm); + err = sp_2048_mod_exp_72(r, a, d, 2048, m, 0); } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3425,7 +3858,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 90); + ForceZero(a, sizeof(sp_digit) * 72); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -3436,7 +3869,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[90 * 4]; + sp_digit d[72 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -3469,7 +3902,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -3477,18 +3910,18 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 90; - m = a + 180; + a = d + 72; + m = a + 144; r = a; - sp_2048_from_bin(a, 90, in, inLen); - sp_2048_from_mp(d, 90, dm); - sp_2048_from_mp(m, 90, mm); - err = sp_2048_mod_exp_90(r, a, d, 2048, m, 0); + sp_2048_from_bin(a, 72, in, inLen); + sp_2048_from_mp(d, 72, dm); + sp_2048_from_mp(m, 72, mm); + err = sp_2048_mod_exp_72(r, a, d, 2048, m, 0); } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3498,7 +3931,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 90); + ForceZero(a, sizeof(sp_digit) * 72); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -3511,10 +3944,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[45 * 11]; + sp_digit a[36 * 8]; #endif sp_digit* p = NULL; - sp_digit* q = NULL; 
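+    /* In this reduced-memory CRT path no separate q array is kept: p is
+     * reloaded from pm or qm as needed and dp, dq and qi share a single
+     * 36-word buffer (see the pointer setup below). */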
sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -3543,47 +3975,48 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 45 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 90 * 2; - q = p + 45; - qi = dq = dp = q + 45; - tmpa = qi + 45; - tmpb = tmpa + 90; - r = a + 90; + p = a + 72; + qi = dq = dp = p + 36; + tmpa = qi + 36; + tmpb = tmpa + 72; + r = a; - sp_2048_from_bin(a, 90, in, inLen); - sp_2048_from_mp(p, 45, pm); - sp_2048_from_mp(q, 45, qm); - sp_2048_from_mp(dp, 45, dpm); - err = sp_2048_mod_exp_45(tmpa, a, dp, 1024, p, 1); + sp_2048_from_bin(a, 72, in, inLen); + sp_2048_from_mp(p, 36, pm); + sp_2048_from_mp(dp, 36, dpm); + err = sp_2048_mod_exp_36(tmpa, a, dp, 1024, p, 1); } if (err == MP_OKAY) { - sp_2048_from_mp(dq, 45, dqm); - err = sp_2048_mod_exp_45(tmpb, a, dq, 1024, q, 1); + sp_2048_from_mp(p, 36, qm); + sp_2048_from_mp(dq, 36, dqm); + err = sp_2048_mod_exp_36(tmpb, a, dq, 1024, p, 1); } if (err == MP_OKAY) { - (void)sp_2048_sub_45(tmpa, tmpa, tmpb); - sp_2048_norm_45(tmpa); - sp_2048_cond_add_45(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[44] >> 31)); - sp_2048_cond_add_45(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[44] >> 31)); + sp_2048_from_mp(p, 36, pm); + (void)sp_2048_sub_36(tmpa, tmpa, tmpb); + sp_2048_norm_36(tmpa); + sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31)); + sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31)); - sp_2048_from_mp(qi, 45, qim); - sp_2048_mul_45(tmpa, tmpa, qi); - err = sp_2048_mod_45(tmpa, tmpa, p); + sp_2048_from_mp(qi, 36, qim); + sp_2048_mul_36(tmpa, tmpa, qi); + err = sp_2048_mod_36(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_2048_mul_45(tmpa, q, tmpa); - (void)sp_2048_add_90(r, tmpb, tmpa); - sp_2048_norm_90(r); + sp_2048_from_mp(p, 36, qm); + sp_2048_mul_36(tmpa, p, tmpa); + (void)sp_2048_add_72(r, tmpb, tmpa); + sp_2048_norm_72(r); - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3591,7 +4024,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 45 * 11); + ForceZero(a, sizeof(sp_digit) * 36 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -3602,7 +4035,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[45 * 13]; + sp_digit a[36 * 13]; #endif sp_digit* p = NULL; sp_digit* q = NULL; @@ -3634,7 +4067,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 45 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -3642,43 +4075,43 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - p = a + 90 * 2; - q = p + 45; - dp = q + 45; - dq = dp + 45; - qi = dq + 45; - tmpa = qi + 45; - tmpb = tmpa + 90; + p = a + 72 * 2; + q = p + 36; + dp = q + 36; + dq = dp + 36; + qi = dq + 36; + tmpa = qi + 36; + tmpb = tmpa + 72; r = a; - sp_2048_from_bin(a, 90, in, inLen); - 
sp_2048_from_mp(p, 45, pm); - sp_2048_from_mp(q, 45, qm); - sp_2048_from_mp(dp, 45, dpm); - sp_2048_from_mp(dq, 45, dqm); - sp_2048_from_mp(qi, 45, qim); + sp_2048_from_bin(a, 72, in, inLen); + sp_2048_from_mp(p, 36, pm); + sp_2048_from_mp(q, 36, qm); + sp_2048_from_mp(dp, 36, dpm); + sp_2048_from_mp(dq, 36, dqm); + sp_2048_from_mp(qi, 36, qim); - err = sp_2048_mod_exp_45(tmpa, a, dp, 1024, p, 1); + err = sp_2048_mod_exp_36(tmpa, a, dp, 1024, p, 1); } if (err == MP_OKAY) { - err = sp_2048_mod_exp_45(tmpb, a, dq, 1024, q, 1); + err = sp_2048_mod_exp_36(tmpb, a, dq, 1024, q, 1); } if (err == MP_OKAY) { - (void)sp_2048_sub_45(tmpa, tmpa, tmpb); - sp_2048_norm_45(tmpa); - sp_2048_cond_add_45(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[44] >> 31)); - sp_2048_cond_add_45(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[44] >> 31)); - sp_2048_mul_45(tmpa, tmpa, qi); - err = sp_2048_mod_45(tmpa, tmpa, p); + (void)sp_2048_sub_36(tmpa, tmpa, tmpb); + sp_2048_norm_36(tmpa); + sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31)); + sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31)); + sp_2048_mul_36(tmpa, tmpa, qi); + err = sp_2048_mod_36(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_2048_mul_45(tmpa, tmpa, q); - (void)sp_2048_add_90(r, tmpb, tmpa); - sp_2048_norm_90(r); + sp_2048_mul_36(tmpa, tmpa, q); + (void)sp_2048_add_72(r, tmpb, tmpa); + sp_2048_norm_72(r); - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; } @@ -3686,7 +4119,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 45 * 13); + ForceZero(a, sizeof(sp_digit) * 36 * 13); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -3712,22 +4145,22 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r) err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT); if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ -#if DIGIT_BIT == 23 - XMEMCPY(r->dp, a, sizeof(sp_digit) * 90); - r->used = 90; +#if DIGIT_BIT == 29 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 72); + r->used = 72; mp_clamp(r); -#elif DIGIT_BIT < 23 +#elif DIGIT_BIT < 29 int i; int j = 0; int s = 0; r->dp[0] = 0; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { r->dp[j] |= (mp_digit)(a[i] << s); r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; s = DIGIT_BIT - s; r->dp[++j] = (mp_digit)(a[i] >> s); - while (s + DIGIT_BIT <= 23) { + while (s + DIGIT_BIT <= 29) { s += DIGIT_BIT; r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; if (s == SP_WORD_SIZE) { @@ -3737,7 +4170,7 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r) r->dp[j] = (mp_digit)(a[i] >> s); } } - s = 23 - s; + s = 29 - s; } r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT; mp_clamp(r); @@ -3747,18 +4180,18 @@ static int sp_2048_to_mp(const sp_digit* a, mp_int* r) int s = 0; r->dp[0] = 0; - for (i = 0; i < 90; i++) { + for (i = 0; i < 72; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; - if (s + 23 >= DIGIT_BIT) { + if (s + 29 >= DIGIT_BIT) { #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; r->dp[++j] = a[i] >> s; - s = 23 - s; + s = 29 - s; } else { - s += 23; + s += 29; } } r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT; @@ -3786,7 +4219,7 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[90 * 4]; + sp_digit b[72 * 4]; #endif sp_digit* e = NULL; 
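+    /* e, m and r are carved out of the single 72 * 4 word buffer b: r
+     * aliases b (144 words of result), e sits at b + 144 and m at e + 72. */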
sp_digit* m = NULL; @@ -3808,7 +4241,7 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -3816,15 +4249,15 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif if (err == MP_OKAY) { - e = b + 90 * 2; - m = e + 90; + e = b + 72 * 2; + m = e + 72; r = b; - sp_2048_from_mp(b, 90, base); - sp_2048_from_mp(e, 90, exp); - sp_2048_from_mp(m, 90, mod); + sp_2048_from_mp(b, 72, base); + sp_2048_from_mp(e, 72, exp); + sp_2048_from_mp(m, 72, mod); - err = sp_2048_mod_exp_90(r, b, e, mp_count_bits(exp), m, 0); + err = sp_2048_mod_exp_72(r, b, e, mp_count_bits(exp), m, 0); } if (err == MP_OKAY) { @@ -3837,7 +4270,7 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); + ForceZero(e, sizeof(sp_digit) * 72U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -3847,7 +4280,7 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[90 * 4]; + sp_digit b[72 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -3870,22 +4303,22 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, DYNAMIC_TYPE_DH); + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - e = b + 90 * 2; - m = e + 90; + e = b + 72 * 2; + m = e + 72; r = b; - sp_2048_from_mp(b, 90, base); - sp_2048_from_mp(e, 90, exp); - sp_2048_from_mp(m, 90, mod); + sp_2048_from_mp(b, 72, base); + sp_2048_from_mp(e, 72, exp); + sp_2048_from_mp(m, 72, mod); - err = sp_2048_mod_exp_90(r, b, e, expBits, m, 0); + err = sp_2048_mod_exp_72(r, b, e, expBits, m, 0); } if (err == MP_OKAY) { @@ -3899,7 +4332,7 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); + ForceZero(e, sizeof(sp_digit) * 72U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -3912,202 +4345,166 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_2048 -SP_NOINLINE static void sp_2048_lshift_90(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_2048_lshift_72(sp_digit* r, const sp_digit* a, byte n) { #ifdef WOLFSSL_SP_SMALL int i; - r[90] = a[89] >> (23 - n); - for (i=89; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (23 - n))) & 0x7fffff; + r[72] = a[71] >> (29 - n); + for (i=71; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (29 - n))) & 0x1fffffff; } #else sp_int_digit s; sp_int_digit t; - s = (sp_int_digit)a[89]; - r[90] = s >> (23U - n); - s = (sp_int_digit)(a[89]); t = (sp_int_digit)(a[88]); - r[89] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[88]); t = (sp_int_digit)(a[87]); - r[88] = ((s << n) | (t >> (23U - n))) & 
0x7fffff; - s = (sp_int_digit)(a[87]); t = (sp_int_digit)(a[86]); - r[87] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[86]); t = (sp_int_digit)(a[85]); - r[86] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[85]); t = (sp_int_digit)(a[84]); - r[85] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[84]); t = (sp_int_digit)(a[83]); - r[84] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[83]); t = (sp_int_digit)(a[82]); - r[83] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[82]); t = (sp_int_digit)(a[81]); - r[82] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[81]); t = (sp_int_digit)(a[80]); - r[81] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[80]); t = (sp_int_digit)(a[79]); - r[80] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[79]); t = (sp_int_digit)(a[78]); - r[79] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[78]); t = (sp_int_digit)(a[77]); - r[78] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[77]); t = (sp_int_digit)(a[76]); - r[77] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[76]); t = (sp_int_digit)(a[75]); - r[76] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[75]); t = (sp_int_digit)(a[74]); - r[75] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[74]); t = (sp_int_digit)(a[73]); - r[74] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[73]); t = (sp_int_digit)(a[72]); - r[73] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[72]); t = (sp_int_digit)(a[71]); - r[72] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + s = (sp_int_digit)a[71]; + r[72] = s >> (29U - n); s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]); - r[71] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[71] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]); - r[70] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[70] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]); - r[69] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[69] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]); - r[68] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[68] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]); - r[67] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[67] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]); - r[66] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[66] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]); - r[65] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[65] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]); - r[64] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[64] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]); - r[63] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[63] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]); - r[62] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[62] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]); - r[61] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[61] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = 
(sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]); - r[60] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[60] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]); - r[59] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[59] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]); - r[58] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[58] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]); - r[57] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[57] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]); - r[56] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[56] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]); - r[55] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[55] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]); - r[54] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[54] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]); - r[53] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[53] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]); - r[52] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[52] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]); - r[51] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[51] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]); - r[50] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[50] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]); - r[49] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[49] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]); - r[48] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[48] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]); - r[47] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[47] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]); - r[46] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[46] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]); - r[45] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[45] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]); - r[44] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[44] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]); - r[43] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[43] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]); - r[42] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[42] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]); - r[41] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[41] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]); - r[40] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[40] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]); - r[39] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[39] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[38]); t = 
(sp_int_digit)(a[37]); - r[38] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[38] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]); - r[37] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[37] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]); - r[36] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[36] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[35]); t = (sp_int_digit)(a[34]); - r[35] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[35] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]); - r[34] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[34] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]); - r[33] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[33] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]); - r[32] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[32] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]); - r[31] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[31] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]); - r[30] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[30] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); - r[29] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[29] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]); - r[28] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[28] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); - r[27] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[27] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); - r[26] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[26] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); - r[25] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[25] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); - r[24] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[24] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); - r[23] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[23] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); - r[22] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[22] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); - r[21] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[21] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); - r[20] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[20] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); - r[19] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[19] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); - r[18] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[18] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); - r[17] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[17] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); - r[16] = 
((s << n) | (t >> (23U - n))) & 0x7fffff; + r[16] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); - r[15] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[15] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); - r[14] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[14] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); - r[13] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[13] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); - r[12] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[12] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); - r[11] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[11] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); - r[10] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[10] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); - r[9] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[9] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); - r[8] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[8] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); - r[7] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[7] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); - r[6] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[6] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); - r[5] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[5] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); - r[4] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[4] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); - r[3] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[3] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); - r[2] = ((s << n) | (t >> (23U - n))) & 0x7fffff; + r[2] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); - r[1] = ((s << n) | (t >> (23U - n))) & 0x7fffff; -#endif - r[0] = (a[0] << n) & 0x7fffff; + r[1] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ + r[0] = (a[0] << n) & 0x1fffffff; } /* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) @@ -4118,12 +4515,12 @@ SP_NOINLINE static void sp_2048_lshift_90(sp_digit* r, const sp_digit* a, * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
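+ *
+ * In outline: r is kept in Montgomery form and the exponent is consumed in
+ * 4-bit windows.  Each window does four Montgomery squarings followed by a
+ * plain left shift by the window value y, so one step computes roughly
+ *     r = (r^16 * 2^y) mod m
+ * and any bits shifted above 2^2048 are folded back in with a single-word
+ * multiply of the Montgomery normalizer, which is congruent to 2^2048 mod m.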
*/ -static int sp_2048_mod_exp_2_90(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +static int sp_2048_mod_exp_2_72(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[271]; + sp_digit td[217]; #endif sp_digit* norm = NULL; sp_digit* tmp = NULL; @@ -4136,7 +4533,7 @@ static int sp_2048_mod_exp_2_90(sp_digit* r, const sp_digit* e, int bits, const int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 271, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 217, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -4144,19 +4541,19 @@ static int sp_2048_mod_exp_2_90(sp_digit* r, const sp_digit* e, int bits, const if (err == MP_OKAY) { norm = td; - tmp = td + 180; - XMEMSET(td, 0, sizeof(sp_digit) * 271); + tmp = td + 144; + XMEMSET(td, 0, sizeof(sp_digit) * 217); sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_90(norm, m); + sp_2048_mont_norm_72(norm, m); bits = ((bits + 3) / 4) * 4; - i = ((bits + 22) / 23) - 1; - c = bits % 23; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 23; + c = 29; } - if (i < 90) { + if (i < 72) { n = e[i--] << (32 - c); } else { @@ -4164,41 +4561,53 @@ static int sp_2048_mod_exp_2_90(sp_digit* r, const sp_digit* e, int bits, const i--; } if (c < 4) { - n |= e[i--] << (9 - c); - c += 23; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 28) & 0xf); n <<= 4; c -= 4; - sp_2048_lshift_90(r, norm, (byte)y); + sp_2048_lshift_72(r, norm, (byte)y); while ((i >= 0) || (c >= 4)) { - if (c < 4) { - n |= e[i--] << (9 - c); - c += 23; + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 28) & 0xf); - n <<= 4; - c -= 4; - sp_2048_mont_sqr_90(r, r, m, mp); - sp_2048_mont_sqr_90(r, r, m, mp); - sp_2048_mont_sqr_90(r, r, m, mp); - sp_2048_mont_sqr_90(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); + sp_2048_mont_sqr_72(r, r, m, mp); - sp_2048_lshift_90(r, r, (byte)y); - sp_2048_mul_d_90(tmp, norm, (r[90] << 22) + (r[89] >> 1)); - r[90] = 0; - r[89] &= 0x1L; - (void)sp_2048_add_90(r, r, tmp); - sp_2048_norm_90(r); - o = sp_2048_cmp_90(r, m); - sp_2048_cond_sub_90(r, r, m, ((o < 0) ? + sp_2048_lshift_72(r, r, (byte)y); + sp_2048_mul_d_72(tmp, norm, (r[71] << 11) + (r[70] >> 18)); + r[71] = 0; + r[70] &= 0x3ffffL; + (void)sp_2048_add_72(r, r, tmp); + sp_2048_norm_72(r); + o = sp_2048_cmp_72(r, m); + sp_2048_cond_sub_72(r, r, m, ((o < 0) ? (sp_digit)1 : (sp_digit)0) - 1); } - sp_2048_mont_reduce_90(r, m, mp); - n = sp_2048_cmp_90(r, m); - sp_2048_cond_sub_90(r, r, m, ((n < 0) ? + sp_2048_mont_reduce_72(r, m, mp); + n = sp_2048_cmp_72(r, m); + sp_2048_cond_sub_72(r, r, m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); } @@ -4227,84 +4636,10 @@ static int sp_2048_mod_exp_2_90(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[90 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 2048) { - err = MP_READ_E; - } - else if (expLen > 256) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 2048) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 90 * 2; - m = e + 90; - r = b; - - sp_2048_from_mp(b, 90, base); - sp_2048_from_bin(e, 90, exp, expLen); - sp_2048_from_mp(m, 90, mod); - - #ifdef HAVE_FFDHE_2048 - if (base->used == 1 && base->dp[0] == 2 && - ((m[89] << 15) | (m[88] >> 8)) == 0xffffL) { - err = sp_2048_mod_exp_2_90(r, e, expLen * 8, m); - } - else - #endif - err = sp_2048_mod_exp_90(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_2048_to_bin(r, out); - *outLen = 256; - for (i=0; i<256 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[90 * 4]; + sp_digit b[72 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -4327,7 +4662,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 90 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -4335,29 +4670,29 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, #endif if (err == MP_OKAY) { - e = b + 90 * 2; - m = e + 90; + e = b + 72 * 2; + m = e + 72; r = b; - sp_2048_from_mp(b, 90, base); - sp_2048_from_bin(e, 90, exp, expLen); - sp_2048_from_mp(m, 90, mod); + sp_2048_from_mp(b, 72, base); + sp_2048_from_bin(e, 72, exp, expLen); + sp_2048_from_mp(m, 72, mod); #ifdef HAVE_FFDHE_2048 if (base->used == 1 && base->dp[0] == 2U && - ((m[89] << 15) | (m[88] >> 8)) == 0xffffL) { - err = sp_2048_mod_exp_2_90(r, e, expLen * 8U, m); + (m[70] >> 2) == 0xffffL) { + err = sp_2048_mod_exp_2_72(r, e, expLen * 8U, m); } else { #endif - err = sp_2048_mod_exp_90(r, b, e, expLen * 8U, m, 0); + err = sp_2048_mod_exp_72(r, b, e, expLen * 8U, m, 0); #ifdef HAVE_FFDHE_2048 } #endif } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_72(r, out); *outLen = 256; for (i=0; i<256U && out[i] == 0U; i++) { /* Search for first non-zero. 
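+               Leading zero bytes are then dropped so *outLen holds the
+               significant length of the shared secret.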
*/ @@ -4372,14 +4707,13 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); + ForceZero(e, sizeof(sp_digit) * 72U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ @@ -4400,7 +4734,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[45 * 4]; + sp_digit b[36 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -4422,7 +4756,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 45 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -4430,19 +4764,19 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif if (err == MP_OKAY) { - e = b + 45 * 2; - m = e + 45; + e = b + 36 * 2; + m = e + 36; r = b; - sp_2048_from_mp(b, 45, base); - sp_2048_from_mp(e, 45, exp); - sp_2048_from_mp(m, 45, mod); + sp_2048_from_mp(b, 36, base); + sp_2048_from_mp(e, 36, exp); + sp_2048_from_mp(m, 36, mod); - err = sp_2048_mod_exp_45(r, b, e, mp_count_bits(exp), m, 0); + err = sp_2048_mod_exp_36(r, b, e, mp_count_bits(exp), m, 0); } if (err == MP_OKAY) { - XMEMSET(r + 45, 0, sizeof(*r) * 45U); + XMEMSET(r + 36, 0, sizeof(*r) * 36U); err = sp_2048_to_mp(r, res); } @@ -4452,7 +4786,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); + ForceZero(e, sizeof(sp_digit) * 72U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -4462,7 +4796,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[45 * 4]; + sp_digit b[36 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -4485,26 +4819,26 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 45 * 4, NULL, DYNAMIC_TYPE_DH); + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - e = b + 45 * 2; - m = e + 45; + e = b + 36 * 2; + m = e + 36; r = b; - sp_2048_from_mp(b, 45, base); - sp_2048_from_mp(e, 45, exp); - sp_2048_from_mp(m, 45, mod); + sp_2048_from_mp(b, 36, base); + sp_2048_from_mp(e, 36, exp); + sp_2048_from_mp(m, 36, mod); - err = sp_2048_mod_exp_45(r, b, e, expBits, m, 0); + err = sp_2048_mod_exp_36(r, b, e, expBits, m, 0); } if (err == MP_OKAY) { - XMEMSET(r + 45, 0, sizeof(*r) * 45U); + XMEMSET(r + 36, 0, sizeof(*r) * 36U); err = sp_2048_to_mp(r, res); } @@ -4515,7 +4849,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 90U); + ForceZero(e, sizeof(sp_digit) * 72U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, 
DYNAMIC_TYPE_DH); #endif @@ -4530,6 +4864,7 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif /* !WOLFSSL_SP_NO_2048 */ #ifndef WOLFSSL_SP_NO_3072 +#ifdef WOLFSSL_SP_SMALL /* Read big endian unsigned byte array into r. * * r A single precision integer. @@ -4546,9 +4881,9 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) r[0] = 0; for (i = n-1; i >= 0; i--) { r[j] |= (((sp_digit)a[i]) << s); - if (s >= 15U) { - r[j] &= 0x7fffff; - s = 23U - s; + if (s >= 21U) { + r[j] &= 0x1fffffff; + s = 29U - s; if (j + 1 >= size) { break; } @@ -4573,7 +4908,7 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) */ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { -#if DIGIT_BIT == 23 +#if DIGIT_BIT == 29 int j; XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); @@ -4581,7 +4916,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) for (j = a->used; j < size; j++) { r[j] = 0; } -#elif DIGIT_BIT > 23 +#elif DIGIT_BIT > 29 int i; int j = 0; word32 s = 0; @@ -4589,16 +4924,16 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i] << s); - r[j] &= 0x7fffff; - s = 23U - s; + r[j] &= 0x1fffffff; + s = 29U - s; if (j + 1 >= size) { break; } /* lint allow cast of mismatch word32 and mp_digit */ r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ - while ((s + 23U) <= (word32)DIGIT_BIT) { - s += 23U; - r[j] &= 0x7fffff; + while ((s + 29U) <= (word32)DIGIT_BIT) { + s += 29U; + r[j] &= 0x1fffffff; if (j + 1 >= size) { break; } @@ -4624,12 +4959,12 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i]) << s; - if (s + DIGIT_BIT >= 23) { - r[j] &= 0x7fffff; + if (s + DIGIT_BIT >= 29) { + r[j] &= 0x1fffffff; if (j + 1 >= size) { break; } - s = 23 - s; + s = 29 - s; if (s == DIGIT_BIT) { r[++j] = 0; s = 0; @@ -4656,20 +4991,20 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_106(sp_digit* r, byte* a) { int i; int j; int s = 0; int b; - for (i=0; i<133; i++) { - r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; + for (i=0; i<105; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } j = 3072 / 8 - 1; a[j] = 0; - for (i=0; i<134 && j>=0; i++) { + for (i=0; i<106 && j>=0; i++) { b = 0; /* lint allow cast of mismatch sp_digit and int */ a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ @@ -4677,14 +5012,14 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) if (j < 0) { break; } - while (b < 23) { + while (b < 29) { a[j--] = (byte)(r[i] >> b); b += 8; if (j < 0) { break; } } - s = 8 - (b - 23); + s = 8 - (b - 29); if (j >= 0) { a[j] = 0; } @@ -4694,146 +5029,30 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } -#ifndef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) +/* Normalize the values in each word to 29 bits. * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. + * a Array of sp_digit to normalize. 
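+ *
+ * For example, a word holding 0x40000001 after an addition becomes
+ * 0x00000001 and a carry of 2 is propagated into the next word.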
*/ -SP_NOINLINE static void sp_3072_mul_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_3072_norm_53(sp_digit* a) { int i; - int j; - int64_t t[134]; - - XMEMSET(t, 0, sizeof(t)); - for (i=0; i<67; i++) { - for (j=0; j<67; j++) { - t[i+j] += ((int64_t)a[i]) * b[j]; - } + for (i = 0; i < 52; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } - for (i=0; i<133; i++) { - r[i] = t[i] & 0x7fffff; - t[i+1] += t[i] >> 23; - } - r[133] = (sp_digit)t[133]; } -/* Square a and put result in r. (r = a * a) +/* Normalize the values in each word to 29 bits. * - * r A single precision integer. - * a A single precision integer. + * a Array of sp_digit to normalize. */ -SP_NOINLINE static void sp_3072_sqr_67(sp_digit* r, const sp_digit* a) +static void sp_3072_norm_106(sp_digit* a) { int i; - int j; - int64_t t[134]; - - XMEMSET(t, 0, sizeof(t)); - for (i=0; i<67; i++) { - for (j=0; j> 29; + a[i] &= 0x1fffffff; } - for (i=0; i<133; i++) { - r[i] = t[i] & 0x7fffff; - t[i+1] += t[i] >> 23; - } - r[133] = (sp_digit)t[133]; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 64; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[64] = a[64] + b[64]; - r[65] = a[65] + b[65]; - r[66] = a[66] + b[66]; - - return 0; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_134(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 128; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[128] = a[128] + b[128]; - r[129] = a[129] + b[129]; - r[130] = a[130] + b[130]; - r[131] = a[131] + b[131]; - r[132] = a[132] + b[132]; - r[133] = a[133] + b[133]; - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_134(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 128; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[128] = a[128] - b[128]; - r[129] = a[129] - b[129]; - r[130] = a[130] - b[130]; - r[131] = a[131] - b[131]; - r[132] = a[132] - b[132]; - r[133] = a[133] - b[133]; - - return 0; } /* Multiply a and b into r. (r = a * b) @@ -4842,22 +5061,53 @@ SP_NOINLINE static int sp_3072_sub_134(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
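+ *
+ * In outline this is a product-scanning multiply over 29-bit words: column
+ * k of the result sums the products a[i] * b[k - i].  To keep the 64-bit
+ * accumulator from overflowing, the inner sum is taken in chunks of at most
+ * 15 products (15 * (2^29 - 1)^2 is just under 2^62), carrying out the high
+ * bits after each chunk, roughly:
+ *     c  += lo >> 29;
+ *     lo &= 0x1fffffff;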
*/ -SP_NOINLINE static void sp_3072_mul_134(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_106(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit* z0 = r; - sp_digit z1[134]; - sp_digit* a1 = z1; - sp_digit b1[67]; - sp_digit* z2 = r + 134; - (void)sp_3072_add_67(a1, a, &a[67]); - (void)sp_3072_add_67(b1, b, &b[67]); - sp_3072_mul_67(z2, &a[67], &b[67]); - sp_3072_mul_67(z0, a, b); - sp_3072_mul_67(z1, a1, b1); - (void)sp_3072_sub_134(z1, z1, z2); - (void)sp_3072_sub_134(z1, z1, z0); - (void)sp_3072_add_134(r + 67, r + 67, z1); + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 lo; + + c = ((sp_uint64)a[105]) * b[105]; + r[211] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 209; k >= 0; k--) { + if (k >= 106) { + i = k - 105; + imax = 105; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + } + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -4865,284 +5115,65 @@ SP_NOINLINE static void sp_3072_mul_134(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_3072_sqr_134(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z1[134]; - sp_digit* a1 = z1; - sp_digit* z2 = r + 134; - (void)sp_3072_add_67(a1, a, &a[67]); - sp_3072_sqr_67(z2, &a[67]); - sp_3072_sqr_67(z0, a); - sp_3072_sqr_67(z1, a1); - (void)sp_3072_sub_134(z1, z1, z2); - (void)sp_3072_sub_134(z1, z1, z0); - (void)sp_3072_add_134(r + 67, r + 67, z1); -} - -#endif /* !WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_134(sp_digit* r, const sp_digit* a, - const sp_digit* b) +SP_NOINLINE static void sp_3072_sqr_106(sp_digit* r, const sp_digit* a) { int i; - - for (i = 0; i < 134; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_134(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 134; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
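
The fast-path sp_3072_mul_134/sp_3072_sqr_134 removed above were one level of Karatsuba over 67-word halves: z0 = a0*b0, z2 = a1*b1, and the middle term recovered as (a0+a1)(b0+b1) - z0 - z2; the new 106-word routines compute the product directly, column by column. A toy check of the identity the removed code relied on (the 16-bit halves and values here are arbitrary):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Split two 32-bit values into 16-bit halves: a = a1*B + a0, B = 2^16. */
    const uint64_t B  = 1ULL << 16;
    const uint64_t a  = 0x12345678ULL;
    const uint64_t b  = 0x9abcdef0ULL;
    const uint64_t a0 = a & 0xffff, a1 = a >> 16;
    const uint64_t b0 = b & 0xffff, b1 = b >> 16;

    const uint64_t z0 = a0 * b0;
    const uint64_t z2 = a1 * b1;
    /* One extra multiply replaces the two cross products a1*b0 and a0*b1. */
    const uint64_t z1 = (a0 + a1) * (b0 + b1) - z0 - z2;

    assert(z1 == a1 * b0 + a0 * b1);
    assert(a * b == z2 * B * B + z1 * B + z0);
    return 0;
}
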
- */ -SP_NOINLINE static void sp_3072_mul_134(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[133]) * b[133]; - r[267] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 265; k >= 0; k--) { - for (i = 133; i >= 0; i--) { - j = k - i; - if (j >= 134) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + c = ((sp_uint64)a[105]) * a[105]; + r[211] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 209; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; - } - r[0] = (sp_digit)(c >> 23); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_134(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[133]) * a[133]; - r[267] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 265; k >= 0; k--) { - for (i = 133; i >= 0; i--) { - j = k - i; - if (j >= 134 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + if (k < 105) { + imax = k; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + else { + imax = 105; } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; - } - r[0] = (sp_digit)(c >> 23); -} + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; -#endif /* WOLFSSL_SP_SMALL */ -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 67; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 67; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 64; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[64] = a[64] - b[64]; - r[65] = a[65] - b[65]; - r[66] = a[66] - b[66]; - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. 
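
The new sp_3072_sqr_106 in this hunk roughly halves the multiply count by starting each column at i = (k+1)/2, doubling the off-diagonal products a[i]*a[k-i] and adding the diagonal a[i]*a[i] only for even k; the chunk shrinks to 14 since the partial sums are doubled before being folded into the carry. A small verification of that column identity on a 4-limb toy value (radix 2^10 so the reference square fits in 64 bits):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* 4 limbs of a radix-2^10 number, small enough to check in 64 bits. */
    const uint64_t a[4] = { 0x2f3, 0x1a5, 0x07c, 0x001 };
    const uint64_t radix = 1ULL << 10;
    uint64_t value = 0;
    uint64_t square = 0;
    uint64_t col[7] = { 0 };
    int i, k;

    for (i = 3; i >= 0; i--)
        value = value * radix + a[i];

    /* Column k of a*a: diagonal term once, off-diagonal terms doubled. */
    for (k = 0; k <= 6; k++) {
        i = (k + 1) / 2;
        if ((k & 1) == 0) {
            col[k] += a[i] * a[i];          /* diagonal a[k/2]^2 */
            i++;
        }
        for (; i <= k && i <= 3; i++) {
            if (k - i <= 3)
                col[k] += 2 * a[i] * a[k - i];
        }
    }

    for (k = 6; k >= 0; k--)
        square = square * radix + col[k];

    assert(square == value * value);
    return 0;
}
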
- * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_67(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[66]) * b[66]; - r[133] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 131; k >= 0; k--) { - for (i = 66; i >= 0; i--) { - j = k - i; - if (j >= 67) { - break; + hi += c >> 29; + c &= 0x1fffffff; } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; } - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; - } - r[0] = (sp_digit)(c >> 23); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_67(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[66]) * a[66]; - r[133] = (sp_digit)(c >> 23); - c = (c & 0x7fffff) << 23; - for (k = 131; k >= 0; k--) { - for (i = 66; i >= 0; i--) { - j = k - i; - if (j >= 67 || i <= j) { - break; - } - if (j < 0) { - continue; + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; } + c += t * 2; - c += ((int64_t)a[i]) * a[j] * 2; + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 46); - r[k + 1] = (sp_digit)((c >> 23) & 0x7fffff); - c = (c & 0x7fffff) << 23; } - r[0] = (sp_digit)(c >> 23); + r[0] = (sp_digit)(c >> 29); } -#endif /* WOLFSSL_SP_SMALL */ -#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ - /* Caclulate the bottom digit of -1/a mod 2^n. * * a A single precision number. @@ -5158,10 +5189,10 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) x *= 2 - b * x; /* here x*a==1 mod 2**8 */ x *= 2 - b * x; /* here x*a==1 mod 2**16 */ x *= 2 - b * x; /* here x*a==1 mod 2**32 */ - x &= 0x7fffff; + x &= 0x1fffffff; /* rho = -1/m mod b */ - *rho = ((sp_digit)1 << 23) - x; + *rho = ((sp_digit)1 << 29) - x; } /* Multiply a by scalar b into r. (r = a * b) @@ -5170,95 +5201,58 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) * a A single precision integer. * b A scalar. 
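
sp_3072_mont_setup above lifts an inverse of the low digit of the modulus with the Newton/Hensel step x *= 2 - b*x, which doubles the number of correct low bits each time, then masks to 29 bits and negates to get rho = -1/m mod 2^29. The sketch below checks that derivation; it seeds with b itself (an inverse modulo 8 for any odd b) and spends one extra step rather than starting from the 4-bit seed the function above uses, which reaches the same result (b0 is just an example digit):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t b0 = 0x1b2c3d5f;   /* example odd low digit of a modulus */
    uint32_t x = b0;                  /* odd b0 is its own inverse mod 2^3 */
    uint32_t rho;

    /* Each Newton step doubles the number of correct low bits. */
    x *= 2 - b0 * x;   /* inverse mod 2^6  */
    x *= 2 - b0 * x;   /* inverse mod 2^12 */
    x *= 2 - b0 * x;   /* inverse mod 2^24 */
    x *= 2 - b0 * x;   /* inverse mod 2^48, i.e. all 32 bits here */
    x &= 0x1fffffff;   /* keep 29 bits: x = 1/b0 mod 2^29 */

    assert(((b0 * x) & 0x1fffffff) == 1);

    rho = ((uint32_t)1 << 29) - x;    /* rho = -1/b0 mod 2^29 */
    assert(((b0 * rho) & 0x1fffffff) == 0x1fffffff);
    return 0;
}
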
*/ -SP_NOINLINE static void sp_3072_mul_d_134(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_d_106(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 134; i++) { + for (i = 0; i < 106; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[134] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 132; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 3] = (sp_digit)t2; - } - t += tb * a[132]; - r[132] = (sp_digit)(t & 0x7fffff); - t >>= 23; - t += tb * a[133]; - r[133] = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[134] = (sp_digit)(t & 0x7fffff); -#endif /* WOLFSSL_SP_SMALL */ + r[106] = (sp_digit)t; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_53(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 53; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * * r A single precision number. * m A single precision number. */ -static void sp_3072_mont_norm_67(sp_digit* r, const sp_digit* m) +static void sp_3072_mont_norm_53(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL int i; - for (i=0; i<66; i++) { - r[i] = 0x7fffff; + for (i=0; i<52; i++) { + r[i] = 0x1fffffff; } -#else - int i; - - for (i = 0; i < 64; i += 8) { - r[i + 0] = 0x7fffff; - r[i + 1] = 0x7fffff; - r[i + 2] = 0x7fffff; - r[i + 3] = 0x7fffff; - r[i + 4] = 0x7fffff; - r[i + 5] = 0x7fffff; - r[i + 6] = 0x7fffff; - r[i + 7] = 0x7fffff; - } - r[64] = 0x7fffff; - r[65] = 0x7fffff; -#endif - r[66] = 0x3ffffL; + r[52] = 0xfffffffL; /* r = (2^n - 1) mod n */ - (void)sp_3072_sub_67(r, r, m); + (void)sp_3072_sub_53(r, r, m); /* Add one so r = 2^n mod m */ r[0] += 1; @@ -5271,32 +5265,14 @@ static void sp_3072_mont_norm_67(sp_digit* r, const sp_digit* m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static sp_digit sp_3072_cmp_67(const sp_digit* a, const sp_digit* b) +static sp_digit sp_3072_cmp_53(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL int i; - for (i=66; i>=0; i--) { + for (i=52; i>=0; i--) { r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#else - int i; - - r |= (a[66] - b[66]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[65] - b[65]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[64] - b[64]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 56; i >= 0; i -= 8) { - r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 
1 : 0)); - r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -5309,32 +5285,14 @@ static sp_digit sp_3072_cmp_67(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -static void sp_3072_cond_sub_67(sp_digit* r, const sp_digit* a, +static void sp_3072_cond_sub_53(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 67; i++) { + for (i = 0; i < 53; i++) { r[i] = a[i] - (b[i] & m); } -#else - int i; - - for (i = 0; i < 64; i += 8) { - r[i + 0] = a[i + 0] - (b[i + 0] & m); - r[i + 1] = a[i + 1] - (b[i + 1] & m); - r[i + 2] = a[i + 2] - (b[i + 2] & m); - r[i + 3] = a[i + 3] - (b[i + 3] & m); - r[i + 4] = a[i + 4] - (b[i + 4] & m); - r[i + 5] = a[i + 5] - (b[i + 5] & m); - r[i + 6] = a[i + 6] - (b[i + 6] & m); - r[i + 7] = a[i + 7] - (b[i + 7] & m); - } - r[64] = a[64] - (b[64] & m); - r[65] = a[65] - (b[65] & m); - r[66] = a[66] - (b[66] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -5343,79 +5301,45 @@ static void sp_3072_cond_sub_67(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. */ -SP_NOINLINE static void sp_3072_mul_add_67(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_add_53(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 67; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x7fffff; - t >>= 23; + for (i = 0; i < 53; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; } - r[67] += (sp_digit)t; + r[53] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[8]; + sp_int64 tb = b; + sp_int64 t[4]; int i; - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x7fffff); - for (i = 0; i < 64; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 23) + (t[5] & 0x7fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 23) + (t[6] & 0x7fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 23) + (t[7] & 0x7fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 23) + (t[0] & 0x7fffff)); + t[0] = 0; + for (i = 0; i < 52; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; } - t[1] = tb * a[65]; - r[65] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[66]; - r[66] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - r[67] += (sp_digit)(t[2] >> 23); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 
23. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_67(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 66; i++) { - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - } -#else - int i; - for (i = 0; i < 64; i += 8) { - a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff; - a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff; - a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff; - a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff; - a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff; - a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff; - a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff; - a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff; - } - a[64+1] += a[64] >> 23; a[64] &= 0x7fffff; - a[65+1] += a[65] >> 23; a[65] &= 0x7fffff; -#endif + t[0] += (tb * a[52]) + r[52]; + r[52] = t[0] & 0x1fffffff; + r[53] += (sp_digit)(t[0] >> 29); +#endif /* !WOLFSSL_SP_LARGE_CODE */ } /* Shift the result in the high 1536 bits down to the bottom. @@ -5423,54 +5347,19 @@ static void sp_3072_norm_67(sp_digit* a) * r A single precision number. * a A single precision number. */ -static void sp_3072_mont_shift_67(sp_digit* r, const sp_digit* a) +static void sp_3072_mont_shift_53(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - sp_digit n; - sp_digit s; + sp_int64 n = a[52] >> 28; + n += ((sp_int64)a[53]) << 1; - s = a[67]; - n = a[66] >> 18; - for (i = 0; i < 66; i++) { - n += (s & 0x7fffff) << 5; - r[i] = n & 0x7fffff; - n >>= 23; - s = a[68 + i] + (s >> 23); + for (i = 0; i < 52; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[54 + i]) << 1; } - n += s << 5; - r[66] = n; -#else - sp_digit n; - sp_digit s; - int i; - - s = a[67]; n = a[66] >> 18; - for (i = 0; i < 64; i += 8) { - n += (s & 0x7fffff) << 5; r[i+0] = n & 0x7fffff; - n >>= 23; s = a[i+68] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+1] = n & 0x7fffff; - n >>= 23; s = a[i+69] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+2] = n & 0x7fffff; - n >>= 23; s = a[i+70] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+3] = n & 0x7fffff; - n >>= 23; s = a[i+71] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+4] = n & 0x7fffff; - n >>= 23; s = a[i+72] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+5] = n & 0x7fffff; - n >>= 23; s = a[i+73] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+6] = n & 0x7fffff; - n >>= 23; s = a[i+74] + (s >> 23); - n += (s & 0x7fffff) << 5; r[i+7] = n & 0x7fffff; - n >>= 23; s = a[i+75] + (s >> 23); - } - n += (s & 0x7fffff) << 5; r[64] = n & 0x7fffff; - n >>= 23; s = a[132] + (s >> 23); - n += (s & 0x7fffff) << 5; r[65] = n & 0x7fffff; - n >>= 23; s = a[133] + (s >> 23); - n += s << 5; r[66] = n; -#endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[67], 0, sizeof(*r) * 67U); + r[52] = (sp_digit)n; + XMEMSET(&r[53], 0, sizeof(*r) * 53U); } /* Reduce the number back to 3072 bits using Montgomery reduction. @@ -5479,921 +5368,81 @@ static void sp_3072_mont_shift_67(sp_digit* r, const sp_digit* a) * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
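
The Montgomery reduction that follows clears one low digit per iteration: mu = a[i]*rho mod 2^29 is chosen so that a[i] + mu*m[0] is a multiple of 2^29, leaving only a carry, and once all 53 digits have been cleared sp_3072_mont_shift_53 drops the low 1536 bits. A word-sized check of why that choice of mu works (values are illustrative; rho is derived the same way as in sp_3072_mont_setup):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t mask = 0x1fffffff;        /* 29-bit digits */
    const uint32_t m0   = 0x0d2f4b61;        /* odd low digit of the modulus */
    const uint32_t a0   = 0x1234567;         /* low digit being reduced */
    uint32_t x = m0;                         /* 1/m0 mod 2^29 via Newton steps */
    uint32_t rho;
    uint32_t mu;
    uint64_t t;

    x *= 2 - m0 * x;
    x *= 2 - m0 * x;
    x *= 2 - m0 * x;
    x *= 2 - m0 * x;
    rho = (((uint32_t)1 << 29) - (x & mask)) & mask;

    /* mu = a0 * rho mod 2^29 makes a0 + mu*m0 a multiple of 2^29, so the
     * low digit becomes zero and only a carry into the next digit remains. */
    mu = (a0 * rho) & mask;
    t  = (uint64_t)a0 + (uint64_t)mu * m0;
    assert((t & mask) == 0);
    return 0;
}
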
*/ -static void sp_3072_mont_reduce_67(sp_digit* a, const sp_digit* m, sp_digit mp) +static void sp_3072_mont_reduce_53(sp_digit* a, const sp_digit* m, sp_digit mp) { int i; sp_digit mu; - sp_3072_norm_67(a + 67); + sp_3072_norm_53(a + 53); - for (i=0; i<66; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_3072_mul_add_67(a+i, m, mu); - a[i+1] += a[i] >> 23; + for (i=0; i<52; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_3072_mul_add_53(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x3ffffL; - sp_3072_mul_add_67(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - sp_3072_mont_shift_67(a, a); - sp_3072_cond_sub_67(a, a, m, 0 - (((a[66] >> 18) > 0) ? + mu = (a[i] * mp) & 0xfffffffL; + sp_3072_mul_add_53(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + sp_3072_mont_shift_53(a, a); + sp_3072_cond_sub_53(a, a, m, 0 - (((a[52] - m[52]) > 0) ? (sp_digit)1 : (sp_digit)0)); - sp_3072_norm_67(a); + sp_3072_norm_53(a); } -/* Multiply two Montogmery form numbers mod the modulus (prime). - * (r = a * b mod m) - * - * r Result of multiplication. - * a First number to multiply in Montogmery form. - * b Second number to multiply in Montogmery form. - * m Modulus (prime). - * mp Montogmery mulitplier. - */ -static void sp_3072_mont_mul_67(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit* m, sp_digit mp) -{ - sp_3072_mul_67(r, a, b); - sp_3072_mont_reduce_67(r, m, mp); -} - -/* Square the Montgomery form number. (r = a * a mod m) - * - * r Result of squaring. - * a Number to square in Montogmery form. - * m Modulus (prime). - * mp Montogmery mulitplier. - */ -static void sp_3072_mont_sqr_67(sp_digit* r, const sp_digit* a, - const sp_digit* m, sp_digit mp) -{ - sp_3072_sqr_67(r, a); - sp_3072_mont_reduce_67(r, m, mp); -} - -/* Multiply a by scalar b into r. (r = a * b) +/* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. - * b A scalar. - */ -SP_NOINLINE static void sp_3072_mul_d_67(sp_digit* r, const sp_digit* a, - sp_digit b) -{ -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; - int i; - - for (i = 0; i < 67; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; - } - r[67] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 64; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 3] = (sp_digit)t2; - } - t += tb * a[64]; - r[64] = (sp_digit)(t & 0x7fffff); - t >>= 23; - t += tb * a[65]; - r[65] = (sp_digit)(t & 0x7fffff); - t >>= 23; - t += tb * a[66]; - r[66] = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[67] = (sp_digit)(t & 0x7fffff); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Conditionally add a and b using the mask m. - * m is -1 to add and 0 when not. - * - * r A single precision number representing conditional add result. - * a A single precision number to add with. - * b A single precision number to add. - * m Mask value to apply. 
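
The cond_sub/cond_add helpers in this file, such as sp_3072_cond_sub_53 above and the 67-word versions removed here, take a mask that is either 0 or all ones so that b[i] & m selects the operand without a secret-dependent branch; callers build the mask as 0 - flag. A minimal sketch of that masking pattern (the real helpers work on signed 29-bit digits and let borrows be fixed up by a later normalization; this only shows the selection):

#include <assert.h>
#include <stdint.h>

/* Subtract b from a only when mask is all ones (mask is 0 or 0xffffffff). */
static void cond_sub(uint32_t* r, const uint32_t* a, const uint32_t* b,
                     int n, uint32_t mask)
{
    int i;
    for (i = 0; i < n; i++) {
        r[i] = a[i] - (b[i] & mask);
    }
}

int main(void)
{
    const uint32_t a[3] = { 10, 20, 30 };
    const uint32_t b[3] = {  1,  2,  3 };
    uint32_t r[3];
    int flag;

    flag = 1;                                  /* e.g. "a >= m" in the reduce */
    cond_sub(r, a, b, 3, (uint32_t)0 - (uint32_t)flag);
    assert(r[0] == 9 && r[1] == 18 && r[2] == 27);

    flag = 0;                                  /* subtraction suppressed */
    cond_sub(r, a, b, 3, (uint32_t)0 - (uint32_t)flag);
    assert(r[0] == 10 && r[1] == 20 && r[2] == 30);
    return 0;
}
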
- */ -static void sp_3072_cond_add_67(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 67; i++) { - r[i] = a[i] + (b[i] & m); - } -#else - int i; - - for (i = 0; i < 64; i += 8) { - r[i + 0] = a[i + 0] + (b[i + 0] & m); - r[i + 1] = a[i + 1] + (b[i + 1] & m); - r[i + 2] = a[i + 2] + (b[i + 2] & m); - r[i + 3] = a[i + 3] + (b[i + 3] & m); - r[i + 4] = a[i + 4] + (b[i + 4] & m); - r[i + 5] = a[i + 5] + (b[i + 5] & m); - r[i + 6] = a[i + 6] + (b[i + 6] & m); - r[i + 7] = a[i + 7] + (b[i + 7] & m); - } - r[64] = a[64] + (b[64] & m); - r[65] = a[65] + (b[65] & m); - r[66] = a[66] + (b[66] & m); -#endif /* WOLFSSL_SP_SMALL */ -} - -#ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_3072_div_word_67(sp_digit d1, sp_digit d0, - sp_digit dv) -{ - sp_digit d; - sp_digit r; - sp_digit t; - - /* All 23 bits from d1 and top 8 bits from d0. */ - d = (d1 << 8) + (d0 >> 15); - r = d / dv; - d -= r * dv; - /* Up to 9 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 7) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 17 bits in r */ - /* Remaining 7 bits from d0. */ - r <<= 7; - d <<= 7; - d += d0 & ((1 << 7) - 1); - t = d / dv; - r += t; - - /* All 23 bits from d1 and top 8 bits from d0. */ - return r; -} -#endif /* WOLFSSL_SP_DIV_32 */ - -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * Large number of bits in last word. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. - */ -static int sp_3072_div_67(const sp_digit* a, const sp_digit* d, - const sp_digit* m, sp_digit* r) -{ - int i; -#ifndef WOLFSSL_SP_DIV_32 - int64_t d1; -#endif - sp_digit dv; - sp_digit r1; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* t1 = NULL; -#else - sp_digit t1[3 * 67 + 1]; -#endif - sp_digit* t2 = NULL; - int err = MP_OKAY; - - (void)m; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 67 + 1), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (t1 == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - t2 = t1 + 2 * 67; - - dv = d[66]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 67U); - for (i=66; i>=0; i--) { - t1[67 + i] += t1[67 + i - 1] >> 23; - t1[67 + i - 1] &= 0x7fffff; -#ifndef WOLFSSL_SP_DIV_32 - d1 = t1[67 + i]; - d1 <<= 23; - d1 += t1[67 + i - 1]; - r1 = (sp_digit)(d1 / dv); -#else - r1 = sp_3072_div_word_67(t1[67 + i], t1[67 + i - 1], dv); -#endif - - sp_3072_mul_d_67(t2, d, r1); - (void)sp_3072_sub_67(&t1[i], &t1[i], t2); - sp_3072_norm_67(&t1[i]); - t1[67 + i] -= t2[67]; - t1[67 + i] += t1[67 + i - 1] >> 23; - t1[67 + i - 1] &= 0x7fffff; - r1 = (((-t1[67 + i]) << 23) - t1[67 + i - 1]) / dv; - r1++; - sp_3072_mul_d_67(t2, d, r1); - (void)sp_3072_add_67(&t1[i], &t1[i], t2); - t1[67 + i] += t1[67 + i - 1] >> 23; - t1[67 + i - 1] &= 0x7fffff; - } - t1[67 - 1] += t1[67 - 2] >> 23; - t1[67 - 2] &= 0x7fffff; - r1 = t1[67 - 1] / dv; - - sp_3072_mul_d_67(t2, d, r1); - (void)sp_3072_sub_67(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 134U); - for (i=0; i<66; i++) { - r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; - } - sp_3072_cond_add_67(r, r, d, 0 - ((r[66] < 0) ? 
- (sp_digit)1 : (sp_digit)0)); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (t1 != NULL) - XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. - */ -static int sp_3072_mod_67(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_3072_div_67(a, m, NULL, r); -} - -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_3072_mod_exp_67(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[3 * 134]; -#endif - sp_digit* t[3] = {0, 0, 0}; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 67 * 2, NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<3; i++) { - t[i] = td + (i * 67 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 67U * 2U); - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_67(norm, m); - - if (reduceA != 0) { - err = sp_3072_mod_67(t[1], a, m); - } - else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 67U); - } - } - if (err == MP_OKAY) { - sp_3072_mul_67(t[1], t[1], norm); - err = sp_3072_mod_67(t[1], t[1], m); - } - - if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); - for (; ; c--) { - if (c == 0) { - if (i == -1) { - break; - } - - n = e[i--]; - c = 23; - } - - y = (int)((n >> 22) & 1); - n <<= 1; - - sp_3072_mont_mul_67(t[y^1], t[0], t[1], m, mp); - - XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + - ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 67 * 2); - sp_3072_mont_sqr_67(t[2], t[2], m, mp); - XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + - ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 67 * 2); - } - - sp_3072_mont_reduce_67(t[0], m, mp); - n = sp_3072_cmp_67(t[0], m); - sp_3072_cond_sub_67(t[0], t[0], m, ((n < 0) ? 
- (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 67 * 2); - - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -#elif !defined(WC_NO_CACHE_RESISTANT) -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[3 * 134]; -#endif - sp_digit* t[3] = {0, 0, 0}; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 67 * 2, NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<3; i++) { - t[i] = td + (i * 67 * 2); - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_67(norm, m); - - if (reduceA != 0) { - err = sp_3072_mod_67(t[1], a, m); - if (err == MP_OKAY) { - sp_3072_mul_67(t[1], t[1], norm); - err = sp_3072_mod_67(t[1], t[1], m); - } - } - else { - sp_3072_mul_67(t[1], a, norm); - err = sp_3072_mod_67(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); - for (; ; c--) { - if (c == 0) { - if (i == -1) { - break; - } - - n = e[i--]; - c = 23; - } - - y = (int)((n >> 22) & 1); - n <<= 1; - - sp_3072_mont_mul_67(t[y^1], t[0], t[1], m, mp); - - XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + - ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 67 * 2); - sp_3072_mont_sqr_67(t[2], t[2], m, mp); - XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + - ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 67 * 2); - } - - sp_3072_mont_reduce_67(t[0], m, mp); - n = sp_3072_cmp_67(t[0], m); - sp_3072_cond_sub_67(t[0], t[0], m, ((n < 0) ? 
- (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 67 * 2); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -#else -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[(32 * 134) + 134]; -#endif - sp_digit* t[32]; - sp_digit* rt = NULL; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 134) + 134), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) - t[i] = td + i * 134; - rt = td + 4288; - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_67(norm, m); - - if (reduceA != 0) { - err = sp_3072_mod_67(t[1], a, m); - if (err == MP_OKAY) { - sp_3072_mul_67(t[1], t[1], norm); - err = sp_3072_mod_67(t[1], t[1], m); - } - } - else { - sp_3072_mul_67(t[1], a, norm); - err = sp_3072_mod_67(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_3072_mont_sqr_67(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_67(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_67(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_67(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_67(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_67(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_67(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_67(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_67(t[10], t[ 5], m, mp); - sp_3072_mont_mul_67(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_67(t[12], t[ 6], m, mp); - sp_3072_mont_mul_67(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_67(t[14], t[ 7], m, mp); - sp_3072_mont_mul_67(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_67(t[16], t[ 8], m, mp); - sp_3072_mont_mul_67(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_67(t[18], t[ 9], m, mp); - sp_3072_mont_mul_67(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_67(t[20], t[10], m, mp); - sp_3072_mont_mul_67(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_67(t[22], t[11], m, mp); - sp_3072_mont_mul_67(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_67(t[24], t[12], m, mp); - sp_3072_mont_mul_67(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_67(t[26], t[13], m, mp); - sp_3072_mont_mul_67(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_67(t[28], t[14], m, mp); - sp_3072_mont_mul_67(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_67(t[30], t[15], m, mp); - sp_3072_mont_mul_67(t[31], t[16], t[15], m, mp); - - bits = ((bits + 4) / 5) * 5; - i = ((bits + 22) / 23) - 1; - c = bits % 23; - if (c == 0) { - c = 23; - } - if (i < 67) { - n = e[i--] << (32 - c); - } - else { - n = 0; - i--; - } - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; - } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 134); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; - } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - - sp_3072_mont_sqr_67(rt, rt, m, mp); - sp_3072_mont_sqr_67(rt, rt, m, mp); - sp_3072_mont_sqr_67(rt, rt, m, mp); - sp_3072_mont_sqr_67(rt, rt, m, mp); - sp_3072_mont_sqr_67(rt, rt, m, mp); - - sp_3072_mont_mul_67(rt, rt, t[y], m, mp); - } - - sp_3072_mont_reduce_67(rt, m, mp); - n = sp_3072_cmp_67(rt, m); - sp_3072_cond_sub_67(rt, rt, m, ((n < 0) ? 
- (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 134); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -#endif -} - -#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ - -/* r = 2^n mod m where n is the number of bits to reduce by. - * Given m must be 3072 bits, just need to subtract. - * - * r A single precision number. - * m A single precision number. - */ -static void sp_3072_mont_norm_134(sp_digit* r, const sp_digit* m) -{ - /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<133; i++) { - r[i] = 0x7fffff; - } -#else - int i; - - for (i = 0; i < 128; i += 8) { - r[i + 0] = 0x7fffff; - r[i + 1] = 0x7fffff; - r[i + 2] = 0x7fffff; - r[i + 3] = 0x7fffff; - r[i + 4] = 0x7fffff; - r[i + 5] = 0x7fffff; - r[i + 6] = 0x7fffff; - r[i + 7] = 0x7fffff; - } - r[128] = 0x7fffff; - r[129] = 0x7fffff; - r[130] = 0x7fffff; - r[131] = 0x7fffff; - r[132] = 0x7fffff; -#endif - r[133] = 0x1fffL; - - /* r = (2^n - 1) mod n */ - (void)sp_3072_sub_134(r, r, m); - - /* Add one so r = 2^n mod m */ - r[0] += 1; -} - -/* Compare a with b in constant time. - * - * a A single precision integer. * b A single precision integer. - * return -ve, 0 or +ve if a is less than, equal to or greater than b - * respectively. */ -static sp_digit sp_3072_cmp_134(const sp_digit* a, const sp_digit* b) -{ - sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=133; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else - int i; - - r |= (a[133] - b[133]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[132] - b[132]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[131] - b[131]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[130] - b[130]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[129] - b[129]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[128] - b[128]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 120; i >= 0; i -= 8) { - r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#endif /* WOLFSSL_SP_SMALL */ - - return r; -} - -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. 
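
sp_3072_cmp_134 removed here, like the surviving sp_3072_cmp_53, compares from the most significant digit down without an early exit: each difference is folded in under a mask that is all ones only while every higher digit has compared equal, so the running time does not depend on where the operands differ and the sign of the result gives the ordering. A word-sized sketch of that accumulation (int32_t stands in for sp_digit):

#include <assert.h>
#include <stdint.h>

/* Compare two n-digit numbers, most significant digit first, without an
 * early exit: once a difference has been captured, the mask (0 - (r == 0))
 * becomes zero and later digits no longer change the result. */
static int32_t ct_cmp(const int32_t* a, const int32_t* b, int n)
{
    int32_t r = 0;
    int i;
    for (i = n - 1; i >= 0; i--) {
        r |= (a[i] - b[i]) & (0 - (int32_t)((r == 0) ? 1 : 0));
    }
    return r;
}

int main(void)
{
    const int32_t a[3] = { 0x1ffffff, 0x5, 0x7 };
    const int32_t b[3] = { 0x0000001, 0x6, 0x7 };

    assert(ct_cmp(a, b, 3) < 0);   /* a < b: decided by digit 1, not digit 0 */
    assert(ct_cmp(b, a, 3) > 0);
    assert(ct_cmp(a, a, 3) == 0);
    return 0;
}
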
- */ -static void sp_3072_cond_sub_134(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 134; i++) { - r[i] = a[i] - (b[i] & m); - } -#else - int i; - - for (i = 0; i < 128; i += 8) { - r[i + 0] = a[i + 0] - (b[i + 0] & m); - r[i + 1] = a[i + 1] - (b[i + 1] & m); - r[i + 2] = a[i + 2] - (b[i + 2] & m); - r[i + 3] = a[i + 3] - (b[i + 3] & m); - r[i + 4] = a[i + 4] - (b[i + 4] & m); - r[i + 5] = a[i + 5] - (b[i + 5] & m); - r[i + 6] = a[i + 6] - (b[i + 6] & m); - r[i + 7] = a[i + 7] - (b[i + 7] & m); - } - r[128] = a[128] - (b[128] & m); - r[129] = a[129] - (b[129] & m); - r[130] = a[130] - (b[130] & m); - r[131] = a[131] - (b[131] & m); - r[132] = a[132] - (b[132] & m); - r[133] = a[133] - (b[133] & m); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Mul a by scalar b and add into r. (r += a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A scalar. - */ -SP_NOINLINE static void sp_3072_mul_add_134(sp_digit* r, const sp_digit* a, - const sp_digit b) -{ -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; - int i; - - for (i = 0; i < 134; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x7fffff; - t >>= 23; - } - r[134] += (sp_digit)t; -#else - int64_t tb = b; - int64_t t[8]; - int i; - - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x7fffff); - for (i = 0; i < 128; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 23) + (t[5] & 0x7fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 23) + (t[6] & 0x7fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 23) + (t[7] & 0x7fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 23) + (t[0] & 0x7fffff)); - } - t[1] = tb * a[129]; - r[129] += (sp_digit)((t[0] >> 23) + (t[1] & 0x7fffff)); - t[2] = tb * a[130]; - r[130] += (sp_digit)((t[1] >> 23) + (t[2] & 0x7fffff)); - t[3] = tb * a[131]; - r[131] += (sp_digit)((t[2] >> 23) + (t[3] & 0x7fffff)); - t[4] = tb * a[132]; - r[132] += (sp_digit)((t[3] >> 23) + (t[4] & 0x7fffff)); - t[5] = tb * a[133]; - r[133] += (sp_digit)((t[4] >> 23) + (t[5] & 0x7fffff)); - r[134] += (sp_digit)(t[5] >> 23); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 23. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_134(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 133; i++) { - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - } -#else - int i; - for (i = 0; i < 128; i += 8) { - a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff; - a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff; - a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff; - a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff; - a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff; - a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff; - a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff; - a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff; - } - a[128+1] += a[128] >> 23; a[128] &= 0x7fffff; - a[129+1] += a[129] >> 23; a[129] &= 0x7fffff; - a[130+1] += a[130] >> 23; a[130] &= 0x7fffff; - a[131+1] += a[131] >> 23; a[131] &= 0x7fffff; - a[132+1] += a[132] >> 23; a[132] &= 0x7fffff; -#endif -} - -/* Shift the result in the high 3072 bits down to the bottom. - * - * r A single precision number. 
- * a A single precision number. - */ -static void sp_3072_mont_shift_134(sp_digit* r, const sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - int64_t n = a[133] >> 13; - n += ((int64_t)a[134]) << 10; - - for (i = 0; i < 133; i++) { - r[i] = n & 0x7fffff; - n >>= 23; - n += ((int64_t)a[135 + i]) << 10; - } - r[133] = (sp_digit)n; -#else - int i; - int64_t n = a[133] >> 13; - n += ((int64_t)a[134]) << 10; - for (i = 0; i < 128; i += 8) { - r[i + 0] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 135]) << 10; - r[i + 1] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 136]) << 10; - r[i + 2] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 137]) << 10; - r[i + 3] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 138]) << 10; - r[i + 4] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 139]) << 10; - r[i + 5] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 140]) << 10; - r[i + 6] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 141]) << 10; - r[i + 7] = n & 0x7fffff; - n >>= 23; n += ((int64_t)a[i + 142]) << 10; - } - r[128] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[263]) << 10; - r[129] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[264]) << 10; - r[130] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[265]) << 10; - r[131] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[266]) << 10; - r[132] = n & 0x7fffff; n >>= 23; n += ((int64_t)a[267]) << 10; - r[133] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[134], 0, sizeof(*r) * 134U); -} - -/* Reduce the number back to 3072 bits using Montgomery reduction. - * - * a A single precision number to reduce in place. - * m The single precision number representing the modulus. - * mp The digit representing the negative inverse of m mod 2^n. - */ -static void sp_3072_mont_reduce_134(sp_digit* a, const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_3072_mul_53(sp_digit* r, const sp_digit* a, + const sp_digit* b) { int i; - sp_digit mu; + int imax; + int k; + sp_uint64 c; + sp_uint64 lo; - sp_3072_norm_134(a + 134); - -#ifdef WOLFSSL_SP_DH - if (mp != 1) { - for (i=0; i<133; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; + c = ((sp_uint64)a[52]) * b[52]; + r[105] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 103; k >= 0; k--) { + if (k >= 53) { + i = k - 52; + imax = 52; } - mu = (a[i] * mp) & 0x1fffL; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; - } - else { - for (i=0; i<133; i++) { - mu = a[i] & 0x7fffff; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; } - mu = a[i] & 0x1fffL; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; } -#else - for (i=0; i<133; i++) { - mu = (a[i] * mp) & 0x7fffff; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; - } - mu = (a[i] * mp) & 0x1fffL; - sp_3072_mul_add_134(a+i, m, mu); - a[i+1] += a[i] >> 23; - a[i] &= 0x7fffff; -#endif - sp_3072_mont_shift_134(a, a); - sp_3072_cond_sub_134(a, a, m, 0 - (((a[133] >> 13) > 0) ? 
- (sp_digit)1 : (sp_digit)0)); - sp_3072_norm_134(a); + r[0] = (sp_digit)c; } /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -6405,11 +5454,75 @@ static void sp_3072_mont_reduce_134(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_3072_mont_mul_134(sp_digit* r, const sp_digit* a, +static void sp_3072_mont_mul_53(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_3072_mul_134(r, a, b); - sp_3072_mont_reduce_134(r, m, mp); + sp_3072_mul_53(r, a, b); + sp_3072_mont_reduce_53(r, m, mp); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_53(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 t; + + c = ((sp_uint64)a[52]) * a[52]; + r[105] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 103; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; + } + if (k < 52) { + imax = k; + } + else { + imax = 52; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; + + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 29; + c &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } + } + r[0] = (sp_digit)(c >> 29); } /* Square the Montgomery form number. (r = a * a mod m) @@ -6419,11 +5532,11 @@ static void sp_3072_mont_mul_134(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_3072_mont_sqr_134(sp_digit* r, const sp_digit* a, +static void sp_3072_mont_sqr_53(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_3072_sqr_134(r, a); - sp_3072_mont_reduce_134(r, m, mp); + sp_3072_sqr_53(r, a); + sp_3072_mont_reduce_53(r, m, mp); } /* Multiply a by scalar b into r. (r = a * b) @@ -6432,51 +5545,19 @@ static void sp_3072_mont_sqr_134(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
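
sp_3072_mont_mul_53 and sp_3072_mont_sqr_53 above return a*b*R^-1 mod m with R = 2^1536, which is why sp_3072_mod_exp_53 first multiplies its operand by the norm value (R mod m from sp_3072_mont_norm_53) and reduces to enter Montgomery form, and strips the extra factor of R at the end with one more Montgomery reduction. A single-digit model of that round trip with R = 2^29 (toy modulus; redc/mont_mul here are local helpers, not wolfSSL functions):

#include <assert.h>
#include <stdint.h>

#define MASK29 0x1fffffffU

static uint32_t rho;   /* -1/m mod 2^29 */

/* REDC: given t < m*2^29, return t * 2^-29 mod m. */
static uint32_t redc(uint64_t t, uint32_t m)
{
    uint32_t mu = ((uint32_t)t * rho) & MASK29;
    uint64_t u  = (t + (uint64_t)mu * m) >> 29;
    return (u >= m) ? (uint32_t)(u - m) : (uint32_t)u;
}

static uint32_t mont_mul(uint32_t a, uint32_t b, uint32_t m)
{
    return redc((uint64_t)a * b, m);
}

int main(void)
{
    const uint32_t m = 0x1b2c3d5f;           /* odd toy modulus < 2^29 */
    const uint32_t a = 123456789 % m;
    const uint32_t b = 987654321 % m;
    const uint64_t R = (uint64_t)1 << 29;
    uint32_t x = m;
    uint32_t am, bm, prodm, prod;

    /* rho as in sp_3072_mont_setup: Newton iteration, mask, negate. */
    x *= 2 - m * x;  x *= 2 - m * x;  x *= 2 - m * x;  x *= 2 - m * x;
    rho = (((uint32_t)1 << 29) - (x & MASK29)) & MASK29;

    /* Convert in: multiply by R mod m (the "norm" value), then reduce. */
    am = (uint32_t)((uint64_t)a * (R % m) % m);
    bm = (uint32_t)((uint64_t)b * (R % m) % m);

    prodm = mont_mul(am, bm, m);             /* still in Montgomery form */
    prod  = redc(prodm, m);                  /* convert out: one extra REDC */

    assert(prod == (uint32_t)((uint64_t)a * b % m));
    return 0;
}
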
*/ -SP_NOINLINE static void sp_3072_mul_d_268(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_d_53(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 268; i++) { + for (i = 0; i < 53; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x7fffff); - t >>= 23; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[268] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 268; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x7fffff); - t >>= 23; - r[i + 3] = (sp_digit)t2; - } - r[268] = (sp_digit)(t & 0x7fffff); -#endif /* WOLFSSL_SP_SMALL */ + r[53] = (sp_digit)t; } /* Conditionally add a and b using the mask m. @@ -6487,95 +5568,170 @@ SP_NOINLINE static void sp_3072_mul_d_268(sp_digit* r, const sp_digit* a, * b A single precision number to add. * m Mask value to apply. */ -static void sp_3072_cond_add_134(sp_digit* r, const sp_digit* a, +static void sp_3072_cond_add_53(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 134; i++) { + for (i = 0; i < 53; i++) { r[i] = a[i] + (b[i] & m); } -#else - int i; - - for (i = 0; i < 128; i += 8) { - r[i + 0] = a[i + 0] + (b[i + 0] & m); - r[i + 1] = a[i + 1] + (b[i + 1] & m); - r[i + 2] = a[i + 2] + (b[i + 2] & m); - r[i + 3] = a[i + 3] + (b[i + 3] & m); - r[i + 4] = a[i + 4] + (b[i + 4] & m); - r[i + 5] = a[i + 5] + (b[i + 5] & m); - r[i + 6] = a[i + 6] + (b[i + 6] & m); - r[i + 7] = a[i + 7] + (b[i + 7] & m); - } - r[128] = a[128] + (b[128] & m); - r[129] = a[129] + (b[129] & m); - r[130] = a[130] + (b[130] & m); - r[131] = a[131] + (b[131] & m); - r[132] = a[132] + (b[132] & m); - r[133] = a[133] + (b[133] & m); -#endif /* WOLFSSL_SP_SMALL */ } -SP_NOINLINE static void sp_3072_rshift_134(sp_digit* r, const sp_digit* a, +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
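
sp_3072_mul_d_53 earlier in this hunk multiplies a multi-digit value by one 29-bit scalar with a single 64-bit accumulator: add the next product, keep the low 29 bits as the output digit, shift the rest down as the carry into the next digit. A small check of that carry chain against a directly computed reference (values kept small so the reference fits in 64 bits):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* a = a0 + a1*2^29, b a 10-bit scalar: a*b fits comfortably in 64 bits. */
    const uint32_t a[2] = { 0x1abcdef5, 0x0005432 };
    const uint32_t b = 0x3ff;
    uint32_t r[3];
    int64_t t = 0;
    uint64_t value, result;
    int i;

    for (i = 0; i < 2; i++) {
        t += (int64_t)b * a[i];          /* product plus incoming carry */
        r[i] = (uint32_t)(t & 0x1fffffff);
        t >>= 29;                        /* carry for the next digit */
    }
    r[2] = (uint32_t)t;

    value  = (uint64_t)a[0] + ((uint64_t)a[1] << 29);
    result = (uint64_t)r[0] + ((uint64_t)r[1] << 29) + ((uint64_t)r[2] << 58);
    assert(result == value * b);
    return 0;
}
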
+ */ +SP_NOINLINE static int sp_3072_add_53(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 53; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_3072_rshift_53(sp_digit* r, const sp_digit* a, byte n) { int i; -#ifdef WOLFSSL_SP_SMALL - for (i=0; i<133; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (23 - n))) & 0x7fffff; + for (i=0; i<52; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } -#else - for (i=0; i<128; i += 8) { - r[i+0] = (a[i+0] >> n) | ((a[i+1] << (23 - n)) & 0x7fffff); - r[i+1] = (a[i+1] >> n) | ((a[i+2] << (23 - n)) & 0x7fffff); - r[i+2] = (a[i+2] >> n) | ((a[i+3] << (23 - n)) & 0x7fffff); - r[i+3] = (a[i+3] >> n) | ((a[i+4] << (23 - n)) & 0x7fffff); - r[i+4] = (a[i+4] >> n) | ((a[i+5] << (23 - n)) & 0x7fffff); - r[i+5] = (a[i+5] >> n) | ((a[i+6] << (23 - n)) & 0x7fffff); - r[i+6] = (a[i+6] >> n) | ((a[i+7] << (23 - n)) & 0x7fffff); - r[i+7] = (a[i+7] >> n) | ((a[i+8] << (23 - n)) & 0x7fffff); - } - r[128] = (a[128] >> n) | ((a[129] << (23 - n)) & 0x7fffff); - r[129] = (a[129] >> n) | ((a[130] << (23 - n)) & 0x7fffff); - r[130] = (a[130] >> n) | ((a[131] << (23 - n)) & 0x7fffff); - r[131] = (a[131] >> n) | ((a[132] << (23 - n)) & 0x7fffff); - r[132] = (a[132] >> n) | ((a[133] << (23 - n)) & 0x7fffff); -#endif - r[133] = a[133] >> n; + r[52] = a[52] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_3072_div_word_134(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_3072_div_word_53(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; sp_digit r; sp_digit t; - /* All 23 bits from d1 and top 8 bits from d0. */ - d = (d1 << 8) + (d0 >> 15); + /* All 29 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 27); r = d / dv; d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; /* Up to 9 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 7) & ((1 << 8) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); t = d / dv; d -= t * dv; r += t; /* Up to 17 bits in r */ - /* Remaining 7 bits from d0. */ - r <<= 7; - d <<= 7; - d += d0 & ((1 << 7) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); t = d / dv; r += t; - /* All 23 bits from d1 and top 8 bits from d0. */ + /* All 29 bits from d1 and top 2 bits from d0. */ return r; } #endif /* WOLFSSL_SP_DIV_32 */ @@ -6591,19 +5747,19 @@ static WC_INLINE sp_digit sp_3072_div_word_134(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_3072_div_134(const sp_digit* a, const sp_digit* d, +static int sp_3072_div_53(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 134 + 3]; + sp_digit t1[4 * 53 + 3]; #endif sp_digit* t2 = NULL; sp_digit* sd = NULL; @@ -6612,7 +5768,7 @@ static int sp_3072_div_134(const sp_digit* a, const sp_digit* d, (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 134 + 3), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 53 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; @@ -6621,53 +5777,60 @@ static int sp_3072_div_134(const sp_digit* a, const sp_digit* d, (void)m; if (err == MP_OKAY) { - t2 = t1 + 268 + 1; - sd = t2 + 134 + 1; + t2 = t1 + 106 + 1; + sd = t2 + 53 + 1; - sp_3072_mul_d_134(sd, d, (sp_digit)1 << 10); - sp_3072_mul_d_268(t1, a, (sp_digit)1 << 10); - dv = sd[133]; - t1[134 + 134] += t1[134 + 134 - 1] >> 23; - t1[134 + 134 - 1] &= 0x7fffff; - for (i=134; i>=0; i--) { + sp_3072_mul_d_53(sd, d, (sp_digit)1 << 1); + sp_3072_mul_d_106(t1, a, (sp_digit)1 << 1); + dv = sd[52]; + t1[53 + 53] += t1[53 + 53 - 1] >> 29; + t1[53 + 53 - 1] &= 0x1fffffff; + for (i=53; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[134 + i]; - d1 <<= 23; - d1 += t1[134 + i - 1]; + d1 = t1[53 + i]; + d1 <<= 29; + d1 += t1[53 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_3072_div_word_134(t1[134 + i], t1[134 + i - 1], dv); + r1 = sp_3072_div_word_53(t1[53 + i], t1[53 + i - 1], dv); #endif - sp_3072_mul_d_134(t2, sd, r1); - (void)sp_3072_sub_134(&t1[i], &t1[i], t2); - sp_3072_norm_134(&t1[i]); - t1[134 + i] -= t2[134]; - t1[134 + i] += t1[134 + i - 1] >> 23; - t1[134 + i - 1] &= 0x7fffff; - r1 = (((-t1[134 + i]) << 23) - t1[134 + i - 1]) / dv; - r1 -= t1[134 + i]; - sp_3072_mul_d_134(t2, sd, r1); - (void)sp_3072_add_134(&t1[i], &t1[i], t2); - t1[134 + i] += t1[134 + i - 1] >> 23; - t1[134 + i - 1] &= 0x7fffff; + sp_3072_mul_d_53(t2, sd, r1); + (void)sp_3072_sub_53(&t1[i], &t1[i], t2); + sp_3072_norm_53(&t1[i]); + t1[53 + i] -= t2[53]; + t1[53 + i] += t1[53 + i - 1] >> 29; + t1[53 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[53 + i]; + d1 <<= 29; + d1 -= t1[53 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_53(-t1[53 + i], -t1[53 + i - 1], dv); +#endif + r1 -= t1[53 + i]; + sp_3072_mul_d_53(t2, sd, r1); + (void)sp_3072_add_53(&t1[i], &t1[i], t2); + t1[53 + i] += t1[53 
+ i - 1] >> 29; + t1[53 + i - 1] &= 0x1fffffff; } - t1[134 - 1] += t1[134 - 2] >> 23; - t1[134 - 2] &= 0x7fffff; - r1 = t1[134 - 1] / dv; + t1[53 - 1] += t1[53 - 2] >> 29; + t1[53 - 2] &= 0x1fffffff; + r1 = t1[53 - 1] / dv; - sp_3072_mul_d_134(t2, sd, r1); - sp_3072_sub_134(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 268U); - for (i=0; i<133; i++) { - r[i+1] += r[i] >> 23; - r[i] &= 0x7fffff; + sp_3072_mul_d_53(t2, sd, r1); + sp_3072_sub_53(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 106U); + for (i=0; i<52; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } - sp_3072_cond_add_134(r, r, sd, 0 - ((r[133] < 0) ? + sp_3072_cond_add_53(r, r, sd, 0 - ((r[52] < 0) ? (sp_digit)1 : (sp_digit)0)); - sp_3072_norm_134(r); - sp_3072_rshift_134(r, r, 10); + sp_3072_norm_53(r); + sp_3072_rshift_53(r, r, 1); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -6685,13 +5848,11 @@ static int sp_3072_div_134(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_3072_mod_134(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_3072_mod_53(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_3072_div_134(a, m, NULL, r); + return sp_3072_div_53(a, m, NULL, r); } -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ - defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -6701,14 +5862,14 @@ static int sp_3072_mod_134(sp_digit* r, const sp_digit* a, const sp_digit* m) * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
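
The small-code sp_3072_mod_exp_53 that follows keeps two working values and, for every exponent bit y, multiplies them into t[y^1] and squares t[y] (selected through the addr_mask copies rather than an index that depends on the secret bit), so the same two operations run for every bit. That is the shape of a Montgomery powering ladder; a word-sized sketch of the same schedule with a toy modulus (mulmod/ladder_exp are local helpers, not wolfSSL functions):

#include <assert.h>
#include <stdint.h>

static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (a * b) % m;     /* fine here: m is far below 2^32 */
}

/* Ladder: for every exponent bit, t[bit^1] = t[0]*t[1] and t[bit] is squared,
 * so the operation sequence does not depend on the bit values. */
static uint64_t ladder_exp(uint64_t g, uint32_t e, int bits, uint64_t m)
{
    uint64_t t[2];
    int i;

    t[0] = 1 % m;
    t[1] = g % m;
    for (i = bits - 1; i >= 0; i--) {
        int y = (int)((e >> i) & 1);
        t[y ^ 1] = mulmod(t[0], t[1], m);
        t[y]     = mulmod(t[y], t[y], m);
    }
    return t[0];
}

int main(void)
{
    const uint64_t m = 1000003;                 /* toy modulus */

    assert(ladder_exp(2, 10, 4, m) == 1024);    /* 2^10 */
    assert(ladder_exp(5, 13, 8, m) == 699465);  /* 5^13 mod 1000003 */
    return 0;
}
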
*/ -static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e, +static int sp_3072_mod_exp_53(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { #if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 268]; + sp_digit td[3 * 106]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -6720,7 +5881,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 134 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 53 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -6729,29 +5890,29 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 134 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 134U * 2U); + t[i] = td + (i * 53 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 53U * 2U); } sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_53(norm, m); if (reduceA != 0) { - err = sp_3072_mod_134(t[1], a, m); + err = sp_3072_mod_53(t[1], a, m); } else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 134U); + XMEMCPY(t[1], a, sizeof(sp_digit) * 53U); } } if (err == MP_OKAY) { - sp_3072_mul_134(t[1], t[1], norm); - err = sp_3072_mod_134(t[1], t[1], m); + sp_3072_mul_53(t[1], t[1], norm); + err = sp_3072_mod_53(t[1], t[1], m); } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -6759,28 +5920,28 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_3072_mont_mul_134(t[y^1], t[0], t[1], m, mp); + sp_3072_mont_mul_53(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 134 * 2); - sp_3072_mont_sqr_134(t[2], t[2], m, mp); + sizeof(*t[2]) * 53 * 2); + sp_3072_mont_sqr_53(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 134 * 2); + sizeof(*t[2]) * 53 * 2); } - sp_3072_mont_reduce_134(t[0], m, mp); - n = sp_3072_cmp_134(t[0], m); - sp_3072_cond_sub_134(t[0], t[0], m, ((n < 0) ? + sp_3072_mont_reduce_53(t[0], m, mp); + n = sp_3072_cmp_53(t[0], m); + sp_3072_cond_sub_53(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 134 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 53 * 2); } @@ -6794,7 +5955,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 268]; + sp_digit td[3 * 106]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -6806,7 +5967,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 134 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 53 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -6815,29 +5976,29 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 134 * 2); + t[i] = td + (i * 53 * 2); } sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_53(norm, m); if (reduceA != 0) { - err = sp_3072_mod_134(t[1], a, m); + err = sp_3072_mod_53(t[1], a, m); if (err == MP_OKAY) { - sp_3072_mul_134(t[1], t[1], norm); - err = sp_3072_mod_134(t[1], t[1], m); + sp_3072_mul_53(t[1], t[1], norm); + err = sp_3072_mod_53(t[1], t[1], m); } } else { - sp_3072_mul_134(t[1], a, norm); - err = sp_3072_mod_134(t[1], t[1], m); + sp_3072_mul_53(t[1], a, norm); + err = sp_3072_mod_53(t[1], t[1], m); } } if (err == MP_OKAY) { - i = bits / 23; - c = bits % 23; - n = e[i--] << (23 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -6845,28 +6006,28 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e } n = e[i--]; - c = 23; + c = 29; } - y = (int)((n >> 22) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_3072_mont_mul_134(t[y^1], t[0], t[1], m, mp); + sp_3072_mont_mul_53(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 134 * 2); - sp_3072_mont_sqr_134(t[2], t[2], m, mp); + sizeof(*t[2]) * 53 * 2); + sp_3072_mont_sqr_53(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 134 * 2); + sizeof(*t[2]) * 53 * 2); } - sp_3072_mont_reduce_134(t[0], m, mp); - n = sp_3072_cmp_134(t[0], m); - sp_3072_cond_sub_134(t[0], t[0], m, ((n < 0) ? + sp_3072_mont_reduce_53(t[0], m, mp); + n = sp_3072_cmp_53(t[0], m); + sp_3072_cond_sub_53(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 134 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 53 * 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -6879,7 +6040,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 268) + 268]; + sp_digit td[(32 * 106) + 106]; #endif sp_digit* t[32]; sp_digit* rt = NULL; @@ -6892,7 +6053,7 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 268) + 268), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 106) + 106), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -6901,64 +6062,64 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; for (i=0; i<32; i++) - t[i] = td + i * 268; - rt = td + 8576; + t[i] = td + i * 106; + rt = td + 3392; sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_53(norm, m); if (reduceA != 0) { - err = sp_3072_mod_134(t[1], a, m); + err = sp_3072_mod_53(t[1], a, m); if (err == MP_OKAY) { - sp_3072_mul_134(t[1], t[1], norm); - err = sp_3072_mod_134(t[1], t[1], m); + sp_3072_mul_53(t[1], t[1], norm); + err = sp_3072_mod_53(t[1], t[1], m); } } else { - sp_3072_mul_134(t[1], a, norm); - err = sp_3072_mod_134(t[1], t[1], m); + sp_3072_mul_53(t[1], a, norm); + err = sp_3072_mod_53(t[1], t[1], m); } } if (err == MP_OKAY) { - sp_3072_mont_sqr_134(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_134(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_134(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_134(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_134(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_134(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_134(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_134(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_134(t[10], t[ 5], m, mp); - sp_3072_mont_mul_134(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_134(t[12], t[ 6], m, mp); - sp_3072_mont_mul_134(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_134(t[14], t[ 7], m, mp); - sp_3072_mont_mul_134(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_134(t[16], t[ 8], m, mp); - sp_3072_mont_mul_134(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_134(t[18], t[ 9], m, mp); - sp_3072_mont_mul_134(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_134(t[20], t[10], m, mp); - sp_3072_mont_mul_134(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_134(t[22], t[11], m, mp); - sp_3072_mont_mul_134(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_134(t[24], t[12], m, mp); - sp_3072_mont_mul_134(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_134(t[26], t[13], m, mp); - sp_3072_mont_mul_134(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_134(t[28], t[14], m, mp); - sp_3072_mont_mul_134(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_134(t[30], t[15], m, mp); - sp_3072_mont_mul_134(t[31], t[16], t[15], m, mp); + sp_3072_mont_sqr_53(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_53(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_53(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_53(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_53(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_53(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_53(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_53(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_53(t[10], t[ 5], m, mp); + 
sp_3072_mont_mul_53(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_53(t[12], t[ 6], m, mp); + sp_3072_mont_mul_53(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_53(t[14], t[ 7], m, mp); + sp_3072_mont_mul_53(t[15], t[ 8], t[ 7], m, mp); + sp_3072_mont_sqr_53(t[16], t[ 8], m, mp); + sp_3072_mont_mul_53(t[17], t[ 9], t[ 8], m, mp); + sp_3072_mont_sqr_53(t[18], t[ 9], m, mp); + sp_3072_mont_mul_53(t[19], t[10], t[ 9], m, mp); + sp_3072_mont_sqr_53(t[20], t[10], m, mp); + sp_3072_mont_mul_53(t[21], t[11], t[10], m, mp); + sp_3072_mont_sqr_53(t[22], t[11], m, mp); + sp_3072_mont_mul_53(t[23], t[12], t[11], m, mp); + sp_3072_mont_sqr_53(t[24], t[12], m, mp); + sp_3072_mont_mul_53(t[25], t[13], t[12], m, mp); + sp_3072_mont_sqr_53(t[26], t[13], m, mp); + sp_3072_mont_mul_53(t[27], t[14], t[13], m, mp); + sp_3072_mont_sqr_53(t[28], t[14], m, mp); + sp_3072_mont_mul_53(t[29], t[15], t[14], m, mp); + sp_3072_mont_sqr_53(t[30], t[15], m, mp); + sp_3072_mont_mul_53(t[31], t[16], t[15], m, mp); bits = ((bits + 4) / 5) * 5; - i = ((bits + 22) / 23) - 1; - c = bits % 23; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 23; + c = 29; } - if (i < 134) { + if (i < 53) { n = e[i--] << (32 - c); } else { @@ -6966,36 +6127,905 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e i--; } if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 27) & 0x1f); n <<= 5; c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 268); + XMEMCPY(rt, t[y], sizeof(sp_digit) * 106); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (9 - c); - c += 23; + if (c >= 5) { + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c = 24; + } + else { + y = (byte)((n >> 27) & 0x1f); + n = e[i--] << 3; + c = 5 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - sp_3072_mont_sqr_134(rt, rt, m, mp); - sp_3072_mont_sqr_134(rt, rt, m, mp); - sp_3072_mont_sqr_134(rt, rt, m, mp); - sp_3072_mont_sqr_134(rt, rt, m, mp); - sp_3072_mont_sqr_134(rt, rt, m, mp); + sp_3072_mont_sqr_53(rt, rt, m, mp); + sp_3072_mont_sqr_53(rt, rt, m, mp); + sp_3072_mont_sqr_53(rt, rt, m, mp); + sp_3072_mont_sqr_53(rt, rt, m, mp); + sp_3072_mont_sqr_53(rt, rt, m, mp); - sp_3072_mont_mul_134(rt, rt, t[y], m, mp); + sp_3072_mont_mul_53(rt, rt, t[y], m, mp); } - sp_3072_mont_reduce_134(rt, m, mp); - n = sp_3072_cmp_134(rt, m); - sp_3072_cond_sub_134(rt, rt, m, ((n < 0) ? + sp_3072_mont_reduce_53(rt, m, mp); + n = sp_3072_cmp_53(rt, m); + sp_3072_cond_sub_53(rt, rt, m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 268); + XMEMCPY(r, rt, sizeof(sp_digit) * 106); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_106(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 106; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 3072 bits, just need to subtract. 
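+ * With 29-bit digits that is 106 words: 105 words of 0x1fffffff and a 27-bit
+ * top word of 0x7ffffff, so after subtracting m and adding 1 the result is
+ * 2^3072 - m, i.e. 2^3072 mod m.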
+ * + * r A single precision number. + * m A single precision number. + */ +static void sp_3072_mont_norm_106(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<105; i++) { + r[i] = 0x1fffffff; + } + r[105] = 0x7ffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_3072_sub_106(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_3072_cmp_106(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=105; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_3072_cond_sub_106(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 106; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_add_106(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 106; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; + } + r[106] += (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 104; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; + } + t[0] += (tb * a[104]) + r[104]; + t[1] = (tb * a[105]) + r[105]; + r[104] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[105] = t[1] & 0x1fffffff; + r[106] += (sp_digit)(t[1] >> 29); +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Shift the result in the high 3072 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_3072_mont_shift_106(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int64 n = a[105] >> 27; + n += ((sp_int64)a[106]) << 2; + + for (i = 0; i < 105; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[107 + i]) << 2; + } + r[105] = (sp_digit)n; + XMEMSET(&r[106], 0, sizeof(*r) * 106U); +} + +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
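+ *
+ * Each iteration picks mu = a[i] * mp (mod 2^29) so that adding mu * m zeroes
+ * word i; the final iteration masks mu to the 27-bit top word so that exactly
+ * 3072 bits are cleared before the result is shifted down and m is
+ * conditionally subtracted once.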
+ */ +static void sp_3072_mont_reduce_106(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_3072_norm_106(a + 106); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<105; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + } + mu = (a[i] * mp) & 0x7ffffffL; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } + else { + for (i=0; i<105; i++) { + mu = a[i] & 0x1fffffff; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + } + mu = a[i] & 0x7ffffffL; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + for (i=0; i<105; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + } + mu = (a[i] * mp) & 0x7ffffffL; + sp_3072_mul_add_106(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; +#endif + sp_3072_mont_shift_106(a, a); + sp_3072_cond_sub_106(a, a, m, 0 - (((a[105] - m[105]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_3072_norm_106(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_mul_106(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_3072_mul_106(r, a, b); + sp_3072_mont_reduce_106(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_sqr_106(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_3072_sqr_106(r, a); + sp_3072_mont_reduce_106(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_212(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 212; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; + } + r[212] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_3072_cond_add_106(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 53; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_106(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 106; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_3072_rshift_106(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<105; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; + } + r[105] = a[105] >> n; +} + +#ifdef WOLFSSL_SP_DIV_32 +static WC_INLINE sp_digit sp_3072_div_word_106(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 29 bits from d1 and top 2 bits from d0. 
*/ + d = (d1 << 2) + (d0 >> 27); + r = d / dv; + d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 29 bits from d1 and top 2 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_32 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
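+ *
+ * The divisor is first shifted left by 2 bits (106 * 29 - 3072) so that its top
+ * word uses all 29 bits, each quotient word is estimated from the top two words
+ * of the running remainder (with a correction step), and the remainder is
+ * shifted back right by 2 bits at the end.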
+ */ +static int sp_3072_div_106(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_32 + sp_int64 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 106 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 106 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 212 + 1; + sd = t2 + 106 + 1; + + sp_3072_mul_d_106(sd, d, (sp_digit)1 << 2); + sp_3072_mul_d_212(t1, a, (sp_digit)1 << 2); + dv = sd[105]; + t1[106 + 106] += t1[106 + 106 - 1] >> 29; + t1[106 + 106 - 1] &= 0x1fffffff; + for (i=106; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_32 + d1 = t1[106 + i]; + d1 <<= 29; + d1 += t1[106 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_106(t1[106 + i], t1[106 + i - 1], dv); +#endif + + sp_3072_mul_d_106(t2, sd, r1); + (void)sp_3072_sub_106(&t1[i], &t1[i], t2); + sp_3072_norm_106(&t1[i]); + t1[106 + i] -= t2[106]; + t1[106 + i] += t1[106 + i - 1] >> 29; + t1[106 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[106 + i]; + d1 <<= 29; + d1 -= t1[106 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_106(-t1[106 + i], -t1[106 + i - 1], dv); +#endif + r1 -= t1[106 + i]; + sp_3072_mul_d_106(t2, sd, r1); + (void)sp_3072_add_106(&t1[i], &t1[i], t2); + t1[106 + i] += t1[106 + i - 1] >> 29; + t1[106 + i - 1] &= 0x1fffffff; + } + t1[106 - 1] += t1[106 - 2] >> 29; + t1[106 - 2] &= 0x1fffffff; + r1 = t1[106 - 1] / dv; + + sp_3072_mul_d_106(t2, sd, r1); + sp_3072_sub_106(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 212U); + for (i=0; i<105; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; + } + sp_3072_cond_add_106(r, r, sd, 0 - ((r[105] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_106(r); + sp_3072_rshift_106(r, r, 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_mod_106(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_106(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
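+ *
+ * Compared with the half-size sp_3072_mod_exp_53(), the fast path here drops to
+ * a 16-entry table and 4-bit windows: a few more multiplies in exchange for
+ * half the precomputation memory.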
+ */ +static int sp_3072_mod_exp_106(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 212]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 106 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 106 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 106U * 2U); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_106(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_106(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 106U); + } + } + if (err == MP_OKAY) { + sp_3072_mul_106(t[1], t[1], norm); + err = sp_3072_mod_106(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 29; + } + + y = (int)((n >> 28) & 1); + n <<= 1; + + sp_3072_mont_mul_106(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 106 * 2); + sp_3072_mont_sqr_106(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 106 * 2); + } + + sp_3072_mont_reduce_106(t[0], m, mp); + n = sp_3072_cmp_106(t[0], m); + sp_3072_cond_sub_106(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 106 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 212]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 106 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 106 * 2); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_106(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_106(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_106(t[1], t[1], norm); + err = sp_3072_mod_106(t[1], t[1], m); + } + } + else { + sp_3072_mul_106(t[1], a, norm); + err = sp_3072_mod_106(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 29; + } + + y = (int)((n >> 28) & 1); + n <<= 1; + + sp_3072_mont_mul_106(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 106 * 2); + sp_3072_mont_sqr_106(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 106 * 2); + } + + sp_3072_mont_reduce_106(t[0], m, mp); + n = sp_3072_cmp_106(t[0], m); + sp_3072_cond_sub_106(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 106 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 212) + 212]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 212) + 212), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 212; + rt = td + 3392; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_106(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_106(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_106(t[1], t[1], norm); + err = sp_3072_mod_106(t[1], t[1], m); + } + } + else { + sp_3072_mul_106(t[1], a, norm); + err = sp_3072_mod_106(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_106(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_106(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_106(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_106(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_106(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_106(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_106(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_106(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_106(t[10], t[ 5], m, mp); + sp_3072_mont_mul_106(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_106(t[12], t[ 6], m, mp); + sp_3072_mont_mul_106(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_106(t[14], t[ 7], m, mp); + sp_3072_mont_mul_106(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 28) / 29) - 1; + c = bits % 29; + if (c == 0) { + c = 29; + } + if (i < 106) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (3 - c); + c += 29; + } + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 212); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; + } + + sp_3072_mont_sqr_106(rt, rt, m, mp); + sp_3072_mont_sqr_106(rt, rt, m, mp); + sp_3072_mont_sqr_106(rt, rt, m, mp); + sp_3072_mont_sqr_106(rt, rt, m, mp); + + sp_3072_mont_mul_106(rt, rt, t[y], m, mp); + } + + sp_3072_mont_reduce_106(rt, m, mp); + n = sp_3072_cmp_106(rt, m); + sp_3072_cond_sub_106(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 212); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -7006,8 +7036,6 @@ static int sp_3072_mod_exp_134(sp_digit* r, const sp_digit* a, const sp_digit* e return err; #endif } -#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ - /* WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. 
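
For reference, the fixed-window structure shared by the fast sp_3072_mod_exp_53() and
sp_3072_mod_exp_106() paths above can be sketched on plain 64-bit integers as follows.
This is an illustrative aside only, not code from sp_c32.c: there is no Montgomery form
and no 29-bit limb handling, and mulmod() and exp_win4() are hypothetical helper names.

    #include <stdio.h>
    #include <stdint.h>

    /* Toy modular multiply: assumes m < 2^32 so a * b cannot overflow 64 bits. */
    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
    {
        return (a * b) % m;
    }

    /* Left-to-right fixed 4-bit window exponentiation: precompute a^0..a^15,
     * then for each window do four squarings and one table multiply. */
    static uint64_t exp_win4(uint64_t a, uint64_t e, uint64_t m)
    {
        uint64_t t[16];
        uint64_t r;
        int i;

        t[0] = 1 % m;
        for (i = 1; i < 16; i++) {
            t[i] = mulmod(t[i - 1], a % m, m);      /* t[i] = a^i mod m */
        }

        r = t[(e >> 60) & 0xf];                     /* top 4-bit window */
        for (i = 56; i >= 0; i -= 4) {
            int j;
            for (j = 0; j < 4; j++) {
                r = mulmod(r, r, m);                /* four squarings */
            }
            r = mulmod(r, t[(e >> i) & 0xf], m);    /* multiply by a^window */
        }
        return r;
    }

    int main(void)
    {
        /* 2^1000 mod 1000003 - the modulus is small enough for the toy mulmod(). */
        printf("%llu\n", (unsigned long long)exp_win4(2, 1000, 1000003));
        return 0;
    }

The functions above additionally keep the operands in Montgomery form, extract the
windows from an exponent stored in 29-bit limbs (the n/c/y bookkeeping), and round the
bit count up to a multiple of the window size before starting.
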
@@ -7029,7 +7057,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[134 * 5]; + sp_digit a[106 * 5]; #endif sp_digit* m = NULL; sp_digit* r = NULL; @@ -7044,7 +7072,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - if (mp_count_bits(em) > 23) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 384U) { @@ -7060,7 +7088,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 5, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 5, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -7068,12 +7096,12 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { - r = a + 134 * 2; - m = r + 134 * 2; + r = a + 106 * 2; + m = r + 106 * 2; norm = r; - sp_3072_from_bin(a, 134, in, inLen); -#if DIGIT_BIT >= 23 + sp_3072_from_bin(a, 106, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -7087,36 +7115,36 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_from_mp(m, 134, mm); + sp_3072_from_mp(m, 106, mm); sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_106(norm, m); } if (err == MP_OKAY) { - sp_3072_mul_134(a, a, norm); - err = sp_3072_mod_134(a, a, m); + sp_3072_mul_106(a, a, norm); + err = sp_3072_mod_106(a, a, m); } if (err == MP_OKAY) { - for (i=22; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 134 * 2); + XMEMCPY(r, a, sizeof(sp_digit) * 106 * 2); for (i--; i>=0; i--) { - sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_3072_mont_mul_134(r, r, a, m, mp); + sp_3072_mont_mul_106(r, r, a, m, mp); } } - sp_3072_mont_reduce_134(r, m, mp); - mp = sp_3072_cmp_134(r, m); - sp_3072_cond_sub_134(r, r, m, ((mp < 0) ? + sp_3072_mont_reduce_106(r, m, mp); + mp = sp_3072_cmp_106(r, m); + sp_3072_cond_sub_106(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0)- 1); - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7130,7 +7158,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[134 * 5]; + sp_digit d[106 * 5]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -7142,7 +7170,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, err = MP_TO_E; } if (err == MP_OKAY) { - if (mp_count_bits(em) > 23) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 384U) { @@ -7158,7 +7186,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 5, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 5, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -7167,11 +7195,11 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { a = d; - r = a + 134 * 2; - m = r + 134 * 2; + r = a + 106 * 2; + m = r + 106 * 2; - sp_3072_from_bin(a, 134, in, inLen); -#if DIGIT_BIT >= 23 + sp_3072_from_bin(a, 106, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -7184,14 +7212,14 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } } if (err == MP_OKAY) { - sp_3072_from_mp(m, 134, mm); + sp_3072_from_mp(m, 106, mm); if (e[0] == 0x3) { - sp_3072_sqr_134(r, a); - err = sp_3072_mod_134(r, r, m); + sp_3072_sqr_106(r, a); + err = sp_3072_mod_106(r, r, m); if (err == MP_OKAY) { - sp_3072_mul_134(r, a, r); - err = sp_3072_mod_134(r, r, m); + sp_3072_mul_106(r, a, r); + err = sp_3072_mod_106(r, r, m); } } else { @@ -7200,36 +7228,36 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_digit mp; sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_106(norm, m); - sp_3072_mul_134(a, a, norm); - err = sp_3072_mod_134(a, a, m); + sp_3072_mul_106(a, a, norm); + err = sp_3072_mod_106(a, a, m); if (err == MP_OKAY) { - for (i=22; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 268U); + XMEMCPY(r, a, sizeof(sp_digit) * 212U); for (i--; i>=0; i--) { - sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_3072_mont_mul_134(r, r, a, m, mp); + sp_3072_mont_mul_106(r, r, a, m, mp); } } - sp_3072_mont_reduce_134(r, m, mp); - mp = sp_3072_cmp_134(r, m); - sp_3072_cond_sub_134(r, r, m, ((mp < 0) ? + sp_3072_mont_reduce_106(r, m, mp); + mp = sp_3072_cmp_106(r, m); + sp_3072_cond_sub_106(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); } } } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7271,7 +7299,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[134 * 4]; + sp_digit d[106 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -7304,7 +7332,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -7312,18 +7340,18 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 134; - m = a + 268; + a = d + 106; + m = a + 212; r = a; - sp_3072_from_bin(a, 134, in, inLen); - sp_3072_from_mp(d, 134, dm); - sp_3072_from_mp(m, 134, mm); - err = sp_3072_mod_exp_134(r, a, d, 3072, m, 0); + sp_3072_from_bin(a, 106, in, inLen); + sp_3072_from_mp(d, 106, dm); + sp_3072_from_mp(m, 106, mm); + err = sp_3072_mod_exp_106(r, a, d, 3072, m, 0); } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7333,7 +7361,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 134); + ForceZero(a, sizeof(sp_digit) * 106); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -7344,7 +7372,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[134 * 4]; + sp_digit d[106 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -7377,7 +7405,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -7385,18 +7413,18 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 134; - m = a + 268; + a = d + 106; + m = a + 212; r = a; - sp_3072_from_bin(a, 134, in, inLen); - sp_3072_from_mp(d, 134, dm); - sp_3072_from_mp(m, 134, mm); - err = sp_3072_mod_exp_134(r, a, d, 3072, m, 0); + sp_3072_from_bin(a, 106, in, inLen); + sp_3072_from_mp(d, 106, dm); + sp_3072_from_mp(m, 106, mm); + err = sp_3072_mod_exp_106(r, a, d, 3072, m, 0); } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7406,7 +7434,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 134); + ForceZero(a, sizeof(sp_digit) * 106); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -7419,10 +7447,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[67 * 11]; + sp_digit a[53 * 8]; #endif 
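+    /* a[] is carved up below as: input/result (106) | p, reused for q (53) |
+     * dp/dq/qi shared (53) | tmpa (106) | tmpb (106) = 53 * 8 words in total. */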
sp_digit* p = NULL; - sp_digit* q = NULL; sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -7451,47 +7478,48 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 67 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 134 * 2; - q = p + 67; - qi = dq = dp = q + 67; - tmpa = qi + 67; - tmpb = tmpa + 134; - r = a + 134; + p = a + 106; + qi = dq = dp = p + 53; + tmpa = qi + 53; + tmpb = tmpa + 106; + r = a; - sp_3072_from_bin(a, 134, in, inLen); - sp_3072_from_mp(p, 67, pm); - sp_3072_from_mp(q, 67, qm); - sp_3072_from_mp(dp, 67, dpm); - err = sp_3072_mod_exp_67(tmpa, a, dp, 1536, p, 1); + sp_3072_from_bin(a, 106, in, inLen); + sp_3072_from_mp(p, 53, pm); + sp_3072_from_mp(dp, 53, dpm); + err = sp_3072_mod_exp_53(tmpa, a, dp, 1536, p, 1); } if (err == MP_OKAY) { - sp_3072_from_mp(dq, 67, dqm); - err = sp_3072_mod_exp_67(tmpb, a, dq, 1536, q, 1); + sp_3072_from_mp(p, 53, qm); + sp_3072_from_mp(dq, 53, dqm); + err = sp_3072_mod_exp_53(tmpb, a, dq, 1536, p, 1); } if (err == MP_OKAY) { - (void)sp_3072_sub_67(tmpa, tmpa, tmpb); - sp_3072_norm_67(tmpa); - sp_3072_cond_add_67(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[66] >> 31)); - sp_3072_cond_add_67(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[66] >> 31)); + sp_3072_from_mp(p, 53, pm); + (void)sp_3072_sub_53(tmpa, tmpa, tmpb); + sp_3072_norm_53(tmpa); + sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31)); + sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31)); - sp_3072_from_mp(qi, 67, qim); - sp_3072_mul_67(tmpa, tmpa, qi); - err = sp_3072_mod_67(tmpa, tmpa, p); + sp_3072_from_mp(qi, 53, qim); + sp_3072_mul_53(tmpa, tmpa, qi); + err = sp_3072_mod_53(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_3072_mul_67(tmpa, q, tmpa); - (void)sp_3072_add_134(r, tmpb, tmpa); - sp_3072_norm_134(r); + sp_3072_from_mp(p, 53, qm); + sp_3072_mul_53(tmpa, p, tmpa); + (void)sp_3072_add_106(r, tmpb, tmpa); + sp_3072_norm_106(r); - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7499,7 +7527,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 67 * 11); + ForceZero(a, sizeof(sp_digit) * 53 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -7510,7 +7538,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[67 * 13]; + sp_digit a[53 * 13]; #endif sp_digit* p = NULL; sp_digit* q = NULL; @@ -7542,7 +7570,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 67 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -7550,43 +7578,43 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - p = a + 134 * 2; - q = p + 67; - dp = q + 67; - dq = dp + 67; - qi = dq + 67; - tmpa = qi + 67; - tmpb = tmpa + 134; + p = a + 106 * 2; + q = p + 53; + dp = q + 53; + dq = dp + 53; + qi = dq + 53; + tmpa = qi + 53; + tmpb = 
tmpa + 106; r = a; - sp_3072_from_bin(a, 134, in, inLen); - sp_3072_from_mp(p, 67, pm); - sp_3072_from_mp(q, 67, qm); - sp_3072_from_mp(dp, 67, dpm); - sp_3072_from_mp(dq, 67, dqm); - sp_3072_from_mp(qi, 67, qim); + sp_3072_from_bin(a, 106, in, inLen); + sp_3072_from_mp(p, 53, pm); + sp_3072_from_mp(q, 53, qm); + sp_3072_from_mp(dp, 53, dpm); + sp_3072_from_mp(dq, 53, dqm); + sp_3072_from_mp(qi, 53, qim); - err = sp_3072_mod_exp_67(tmpa, a, dp, 1536, p, 1); + err = sp_3072_mod_exp_53(tmpa, a, dp, 1536, p, 1); } if (err == MP_OKAY) { - err = sp_3072_mod_exp_67(tmpb, a, dq, 1536, q, 1); + err = sp_3072_mod_exp_53(tmpb, a, dq, 1536, q, 1); } if (err == MP_OKAY) { - (void)sp_3072_sub_67(tmpa, tmpa, tmpb); - sp_3072_norm_67(tmpa); - sp_3072_cond_add_67(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[66] >> 31)); - sp_3072_cond_add_67(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[66] >> 31)); - sp_3072_mul_67(tmpa, tmpa, qi); - err = sp_3072_mod_67(tmpa, tmpa, p); + (void)sp_3072_sub_53(tmpa, tmpa, tmpb); + sp_3072_norm_53(tmpa); + sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31)); + sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31)); + sp_3072_mul_53(tmpa, tmpa, qi); + err = sp_3072_mod_53(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_3072_mul_67(tmpa, tmpa, q); - (void)sp_3072_add_134(r, tmpb, tmpa); - sp_3072_norm_134(r); + sp_3072_mul_53(tmpa, tmpa, q); + (void)sp_3072_add_106(r, tmpb, tmpa); + sp_3072_norm_106(r); - sp_3072_to_bin(r, out); + sp_3072_to_bin_106(r, out); *outLen = 384; } @@ -7594,7 +7622,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 67 * 13); + ForceZero(a, sizeof(sp_digit) * 53 * 13); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -7620,22 +7648,22 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r) err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT); if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ -#if DIGIT_BIT == 23 - XMEMCPY(r->dp, a, sizeof(sp_digit) * 134); - r->used = 134; +#if DIGIT_BIT == 29 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 106); + r->used = 106; mp_clamp(r); -#elif DIGIT_BIT < 23 +#elif DIGIT_BIT < 29 int i; int j = 0; int s = 0; r->dp[0] = 0; - for (i = 0; i < 134; i++) { + for (i = 0; i < 106; i++) { r->dp[j] |= (mp_digit)(a[i] << s); r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; s = DIGIT_BIT - s; r->dp[++j] = (mp_digit)(a[i] >> s); - while (s + DIGIT_BIT <= 23) { + while (s + DIGIT_BIT <= 29) { s += DIGIT_BIT; r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; if (s == SP_WORD_SIZE) { @@ -7645,7 +7673,7 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r) r->dp[j] = (mp_digit)(a[i] >> s); } } - s = 23 - s; + s = 29 - s; } r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; mp_clamp(r); @@ -7655,18 +7683,18 @@ static int sp_3072_to_mp(const sp_digit* a, mp_int* r) int s = 0; r->dp[0] = 0; - for (i = 0; i < 134; i++) { + for (i = 0; i < 106; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; - if (s + 23 >= DIGIT_BIT) { + if (s + 29 >= DIGIT_BIT) { #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; r->dp[++j] = a[i] >> s; - s = 23 - s; + s = 29 - s; } else { - s += 23; + s += 29; } } r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; @@ -7694,7 +7722,7 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; 
#else - sp_digit b[134 * 4]; + sp_digit b[106 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -7716,7 +7744,7 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -7724,15 +7752,15 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif if (err == MP_OKAY) { - e = b + 134 * 2; - m = e + 134; + e = b + 106 * 2; + m = e + 106; r = b; - sp_3072_from_mp(b, 134, base); - sp_3072_from_mp(e, 134, exp); - sp_3072_from_mp(m, 134, mod); + sp_3072_from_mp(b, 106, base); + sp_3072_from_mp(e, 106, exp); + sp_3072_from_mp(m, 106, mod); - err = sp_3072_mod_exp_134(r, b, e, mp_count_bits(exp), m, 0); + err = sp_3072_mod_exp_106(r, b, e, mp_count_bits(exp), m, 0); } if (err == MP_OKAY) { @@ -7745,7 +7773,7 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); + ForceZero(e, sizeof(sp_digit) * 106U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -7755,7 +7783,7 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[134 * 4]; + sp_digit b[106 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -7778,22 +7806,22 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, DYNAMIC_TYPE_DH); + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - e = b + 134 * 2; - m = e + 134; + e = b + 106 * 2; + m = e + 106; r = b; - sp_3072_from_mp(b, 134, base); - sp_3072_from_mp(e, 134, exp); - sp_3072_from_mp(m, 134, mod); + sp_3072_from_mp(b, 106, base); + sp_3072_from_mp(e, 106, exp); + sp_3072_from_mp(m, 106, mod); - err = sp_3072_mod_exp_134(r, b, e, expBits, m, 0); + err = sp_3072_mod_exp_106(r, b, e, expBits, m, 0); } if (err == MP_OKAY) { @@ -7807,7 +7835,7 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); + ForceZero(e, sizeof(sp_digit) * 106U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -7820,290 +7848,16 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_3072 -SP_NOINLINE static void sp_3072_lshift_134(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_lshift_106(sp_digit* r, const sp_digit* a, byte n) { -#ifdef WOLFSSL_SP_SMALL int i; - r[134] = a[133] >> (23 - n); - for (i=133; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (23 - n))) & 0x7fffff; + r[106] = a[105] >> (29 - n); + for (i=105; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (29 - n))) & 0x1fffffff; } -#else - sp_int_digit s; - sp_int_digit t; - - s = (sp_int_digit)a[133]; - r[134] = s >> (23U - n); - s = (sp_int_digit)(a[133]); t = (sp_int_digit)(a[132]); - r[133] = ((s << 
n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[132]); t = (sp_int_digit)(a[131]); - r[132] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[131]); t = (sp_int_digit)(a[130]); - r[131] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[130]); t = (sp_int_digit)(a[129]); - r[130] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[129]); t = (sp_int_digit)(a[128]); - r[129] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[128]); t = (sp_int_digit)(a[127]); - r[128] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[127]); t = (sp_int_digit)(a[126]); - r[127] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[126]); t = (sp_int_digit)(a[125]); - r[126] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[125]); t = (sp_int_digit)(a[124]); - r[125] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[124]); t = (sp_int_digit)(a[123]); - r[124] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[123]); t = (sp_int_digit)(a[122]); - r[123] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[122]); t = (sp_int_digit)(a[121]); - r[122] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[121]); t = (sp_int_digit)(a[120]); - r[121] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[120]); t = (sp_int_digit)(a[119]); - r[120] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[119]); t = (sp_int_digit)(a[118]); - r[119] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[118]); t = (sp_int_digit)(a[117]); - r[118] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[117]); t = (sp_int_digit)(a[116]); - r[117] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[116]); t = (sp_int_digit)(a[115]); - r[116] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[115]); t = (sp_int_digit)(a[114]); - r[115] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[114]); t = (sp_int_digit)(a[113]); - r[114] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[113]); t = (sp_int_digit)(a[112]); - r[113] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[112]); t = (sp_int_digit)(a[111]); - r[112] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[111]); t = (sp_int_digit)(a[110]); - r[111] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[110]); t = (sp_int_digit)(a[109]); - r[110] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[109]); t = (sp_int_digit)(a[108]); - r[109] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[108]); t = (sp_int_digit)(a[107]); - r[108] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[107]); t = (sp_int_digit)(a[106]); - r[107] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[106]); t = (sp_int_digit)(a[105]); - r[106] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[105]); t = (sp_int_digit)(a[104]); - r[105] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[104]); t = (sp_int_digit)(a[103]); - r[104] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[103]); t = (sp_int_digit)(a[102]); - r[103] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[102]); t = (sp_int_digit)(a[101]); - r[102] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[101]); t = (sp_int_digit)(a[100]); - r[101] = ((s << n) 
| (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[100]); t = (sp_int_digit)(a[99]); - r[100] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[99]); t = (sp_int_digit)(a[98]); - r[99] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[98]); t = (sp_int_digit)(a[97]); - r[98] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[97]); t = (sp_int_digit)(a[96]); - r[97] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[96]); t = (sp_int_digit)(a[95]); - r[96] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[95]); t = (sp_int_digit)(a[94]); - r[95] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[94]); t = (sp_int_digit)(a[93]); - r[94] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[93]); t = (sp_int_digit)(a[92]); - r[93] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[92]); t = (sp_int_digit)(a[91]); - r[92] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[91]); t = (sp_int_digit)(a[90]); - r[91] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[90]); t = (sp_int_digit)(a[89]); - r[90] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[89]); t = (sp_int_digit)(a[88]); - r[89] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[88]); t = (sp_int_digit)(a[87]); - r[88] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[87]); t = (sp_int_digit)(a[86]); - r[87] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[86]); t = (sp_int_digit)(a[85]); - r[86] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[85]); t = (sp_int_digit)(a[84]); - r[85] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[84]); t = (sp_int_digit)(a[83]); - r[84] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[83]); t = (sp_int_digit)(a[82]); - r[83] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[82]); t = (sp_int_digit)(a[81]); - r[82] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[81]); t = (sp_int_digit)(a[80]); - r[81] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[80]); t = (sp_int_digit)(a[79]); - r[80] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[79]); t = (sp_int_digit)(a[78]); - r[79] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[78]); t = (sp_int_digit)(a[77]); - r[78] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[77]); t = (sp_int_digit)(a[76]); - r[77] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[76]); t = (sp_int_digit)(a[75]); - r[76] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[75]); t = (sp_int_digit)(a[74]); - r[75] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[74]); t = (sp_int_digit)(a[73]); - r[74] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[73]); t = (sp_int_digit)(a[72]); - r[73] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[72]); t = (sp_int_digit)(a[71]); - r[72] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]); - r[71] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]); - r[70] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]); - r[69] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]); - r[68] 
= ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]); - r[67] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]); - r[66] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]); - r[65] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]); - r[64] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]); - r[63] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]); - r[62] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]); - r[61] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]); - r[60] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]); - r[59] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]); - r[58] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]); - r[57] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]); - r[56] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]); - r[55] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]); - r[54] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]); - r[53] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]); - r[52] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]); - r[51] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]); - r[50] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]); - r[49] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]); - r[48] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]); - r[47] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]); - r[46] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]); - r[45] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]); - r[44] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]); - r[43] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]); - r[42] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]); - r[41] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]); - r[40] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]); - r[39] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[38]); t = (sp_int_digit)(a[37]); - r[38] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]); - r[37] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]); - r[36] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[35]); t = 
(sp_int_digit)(a[34]); - r[35] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]); - r[34] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]); - r[33] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]); - r[32] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]); - r[31] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]); - r[30] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); - r[29] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]); - r[28] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); - r[27] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); - r[26] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); - r[25] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); - r[24] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); - r[23] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); - r[22] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); - r[21] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); - r[20] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); - r[19] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); - r[18] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); - r[17] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); - r[16] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); - r[15] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); - r[14] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); - r[13] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); - r[12] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); - r[11] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); - r[10] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); - r[9] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); - r[8] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); - r[7] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); - r[6] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); - r[5] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); - r[4] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); - r[3] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[2]); t = 
(sp_int_digit)(a[1]); - r[2] = ((s << n) | (t >> (23U - n))) & 0x7fffff; - s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); - r[1] = ((s << n) | (t >> (23U - n))) & 0x7fffff; -#endif - r[0] = (a[0] << n) & 0x7fffff; + r[0] = (a[0] << n) & 0x1fffffff; } /* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) @@ -8114,12 +7868,12 @@ SP_NOINLINE static void sp_3072_lshift_134(sp_digit* r, const sp_digit* a, * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. */ -static int sp_3072_mod_exp_2_134(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +static int sp_3072_mod_exp_2_106(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[403]; + sp_digit td[319]; #endif sp_digit* norm = NULL; sp_digit* tmp = NULL; @@ -8132,7 +7886,7 @@ static int sp_3072_mod_exp_2_134(sp_digit* r, const sp_digit* e, int bits, const int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 403, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 319, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -8140,19 +7894,19 @@ static int sp_3072_mod_exp_2_134(sp_digit* r, const sp_digit* e, int bits, const if (err == MP_OKAY) { norm = td; - tmp = td + 268; - XMEMSET(td, 0, sizeof(sp_digit) * 403); + tmp = td + 212; + XMEMSET(td, 0, sizeof(sp_digit) * 319); sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_134(norm, m); + sp_3072_mont_norm_106(norm, m); bits = ((bits + 3) / 4) * 4; - i = ((bits + 22) / 23) - 1; - c = bits % 23; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 23; + c = 29; } - if (i < 134) { + if (i < 106) { n = e[i--] << (32 - c); } else { @@ -8160,41 +7914,53 @@ static int sp_3072_mod_exp_2_134(sp_digit* r, const sp_digit* e, int bits, const i--; } if (c < 4) { - n |= e[i--] << (9 - c); - c += 23; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 28) & 0xf); n <<= 4; c -= 4; - sp_3072_lshift_134(r, norm, (byte)y); + sp_3072_lshift_106(r, norm, (byte)y); while ((i >= 0) || (c >= 4)) { - if (c < 4) { - n |= e[i--] << (9 - c); - c += 23; + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 28) & 0xf); - n <<= 4; - c -= 4; - sp_3072_mont_sqr_134(r, r, m, mp); - sp_3072_mont_sqr_134(r, r, m, mp); - sp_3072_mont_sqr_134(r, r, m, mp); - sp_3072_mont_sqr_134(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); + sp_3072_mont_sqr_106(r, r, m, mp); - sp_3072_lshift_134(r, r, (byte)y); - sp_3072_mul_d_134(tmp, norm, (r[134] << 10) + (r[133] >> 13)); - r[134] = 0; - r[133] &= 0x1fffL; - (void)sp_3072_add_134(r, r, tmp); - sp_3072_norm_134(r); - o = sp_3072_cmp_134(r, m); - sp_3072_cond_sub_134(r, r, m, ((o < 0) ? + sp_3072_lshift_106(r, r, (byte)y); + sp_3072_mul_d_106(tmp, norm, (r[106] << 2) + (r[105] >> 27)); + r[106] = 0; + r[105] &= 0x7ffffffL; + (void)sp_3072_add_106(r, r, tmp); + sp_3072_norm_106(r); + o = sp_3072_cmp_106(r, m); + sp_3072_cond_sub_106(r, r, m, ((o < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); } - sp_3072_mont_reduce_134(r, m, mp); - n = sp_3072_cmp_134(r, m); - sp_3072_cond_sub_134(r, r, m, ((n < 0) ? + sp_3072_mont_reduce_106(r, m, mp); + n = sp_3072_cmp_106(r, m); + sp_3072_cond_sub_106(r, r, m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); } @@ -8223,84 +7989,10 @@ static int sp_3072_mod_exp_2_134(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[134 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 3072) { - err = MP_READ_E; - } - else if (expLen > 384) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 3072) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 134 * 2; - m = e + 134; - r = b; - - sp_3072_from_mp(b, 134, base); - sp_3072_from_bin(e, 134, exp, expLen); - sp_3072_from_mp(m, 134, mod); - - #ifdef HAVE_FFDHE_3072 - if (base->used == 1 && base->dp[0] == 2 && - ((m[133] << 3) | (m[132] >> 20)) == 0xffffL) { - err = sp_3072_mod_exp_2_134(r, e, expLen * 8, m); - } - else - #endif - err = sp_3072_mod_exp_134(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_3072_to_bin(r, out); - *outLen = 384; - for (i=0; i<384 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[134 * 4]; + sp_digit b[106 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -8323,7 +8015,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 134 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -8331,29 +8023,29 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, #endif if (err == MP_OKAY) { - e = b + 134 * 2; - m = e + 134; + e = b + 106 * 2; + m = e + 106; r = b; - sp_3072_from_mp(b, 134, base); - sp_3072_from_bin(e, 134, exp, expLen); - sp_3072_from_mp(m, 134, mod); + sp_3072_from_mp(b, 106, base); + sp_3072_from_bin(e, 106, exp, expLen); + sp_3072_from_mp(m, 106, mod); #ifdef HAVE_FFDHE_3072 if (base->used == 1 && base->dp[0] == 2U && - ((m[133] << 3) | (m[132] >> 20)) == 0xffffL) { - err = sp_3072_mod_exp_2_134(r, e, expLen * 8U, m); + (m[105] >> 11) == 0xffffL) { + err = sp_3072_mod_exp_2_106(r, e, expLen * 8U, m); } else { #endif - err = sp_3072_mod_exp_134(r, b, e, expLen * 8U, m, 0); + err = sp_3072_mod_exp_106(r, b, e, expLen * 8U, m, 0); #ifdef HAVE_FFDHE_3072 } #endif } if (err == MP_OKAY) { - sp_3072_to_bin(r, 
out); + sp_3072_to_bin_106(r, out); *outLen = 384; for (i=0; i<384U && out[i] == 0U; i++) { /* Search for first non-zero. */ @@ -8368,14 +8060,13 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); + ForceZero(e, sizeof(sp_digit) * 106U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ @@ -8396,7 +8087,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[67 * 4]; + sp_digit b[53 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -8418,7 +8109,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 67 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -8426,19 +8117,19 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif if (err == MP_OKAY) { - e = b + 67 * 2; - m = e + 67; + e = b + 53 * 2; + m = e + 53; r = b; - sp_3072_from_mp(b, 67, base); - sp_3072_from_mp(e, 67, exp); - sp_3072_from_mp(m, 67, mod); + sp_3072_from_mp(b, 53, base); + sp_3072_from_mp(e, 53, exp); + sp_3072_from_mp(m, 53, mod); - err = sp_3072_mod_exp_67(r, b, e, mp_count_bits(exp), m, 0); + err = sp_3072_mod_exp_53(r, b, e, mp_count_bits(exp), m, 0); } if (err == MP_OKAY) { - XMEMSET(r + 67, 0, sizeof(*r) * 67U); + XMEMSET(r + 53, 0, sizeof(*r) * 53U); err = sp_3072_to_mp(r, res); } @@ -8448,7 +8139,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); + ForceZero(e, sizeof(sp_digit) * 106U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -8458,7 +8149,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[67 * 4]; + sp_digit b[53 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -8481,26 +8172,26 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 67 * 4, NULL, DYNAMIC_TYPE_DH); + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - e = b + 67 * 2; - m = e + 67; + e = b + 53 * 2; + m = e + 53; r = b; - sp_3072_from_mp(b, 67, base); - sp_3072_from_mp(e, 67, exp); - sp_3072_from_mp(m, 67, mod); + sp_3072_from_mp(b, 53, base); + sp_3072_from_mp(e, 53, exp); + sp_3072_from_mp(m, 53, mod); - err = sp_3072_mod_exp_67(r, b, e, expBits, m, 0); + err = sp_3072_mod_exp_53(r, b, e, expBits, m, 0); } if (err == MP_OKAY) { - XMEMSET(r + 67, 0, sizeof(*r) * 67U); + XMEMSET(r + 53, 0, sizeof(*r) * 53U); err = sp_3072_to_mp(r, res); } @@ -8511,7 +8202,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 134U); 
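
The rewritten sp_3072_mod_exp_2_106 above keeps the FFDHE fast path: because the base is 2, each 4-bit window of the exponent is applied after four Montgomery squarings with sp_3072_lshift_106 followed by a mul_d/add/cond_sub fix-up of the bits shifted past the top limb, instead of a table lookup and a Montgomery multiplication. A minimal sketch of why shifting works for base 2, using plain machine words and a hypothetical helper rather than the 29-bit limb representation:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: 2^e mod m by shift-and-reduce.
     * Assumes 0 < m < 2^63 so the shift cannot overflow. */
    static uint64_t two_pow_mod(uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        while (e--) {
            r <<= 1;            /* multiply by the base, 2 */
            if (r >= m) {
                r -= m;         /* one conditional subtract keeps r < m */
            }
        }
        return r;
    }

    int main(void)
    {
        /* 2^20 mod 1000003 = 48573 */
        printf("%llu\n", (unsigned long long)two_pow_mod(20, 1000003));
        return 0;
    }

The real routine does the same thing four exponent bits at a time on the Montgomery form, with the top-word correction (the sp_3072_mul_d_106 of the shifted-out bits) playing the role of the reduction.
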
+ ForceZero(e, sizeof(sp_digit) * 106U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -8523,9 +8214,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ -#endif /* !WOLFSSL_SP_NO_3072 */ - -#ifdef WOLFSSL_SP_4096 +#else /* Read big endian unsigned byte array into r. * * r A single precision integer. @@ -8533,7 +8222,7 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, * a Byte array. * n Number of bytes in array to read. */ -static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) +static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) { int i; int j = 0; @@ -8542,9 +8231,9 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) r[0] = 0; for (i = n-1; i >= 0; i--) { r[j] |= (((sp_digit)a[i]) << s); - if (s >= 13U) { - r[j] &= 0x1fffff; - s = 21U - s; + if (s >= 20U) { + r[j] &= 0xfffffff; + s = 28U - s; if (j + 1 >= size) { break; } @@ -8567,9 +8256,9 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) * size Maximum number of bytes to convert * a A multi-precision integer. */ -static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) +static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { -#if DIGIT_BIT == 21 +#if DIGIT_BIT == 28 int j; XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); @@ -8577,7 +8266,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) for (j = a->used; j < size; j++) { r[j] = 0; } -#elif DIGIT_BIT > 21 +#elif DIGIT_BIT > 28 int i; int j = 0; word32 s = 0; @@ -8585,16 +8274,16 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i] << s); - r[j] &= 0x1fffff; - s = 21U - s; + r[j] &= 0xfffffff; + s = 28U - s; if (j + 1 >= size) { break; } /* lint allow cast of mismatch word32 and mp_digit */ r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ - while ((s + 21U) <= (word32)DIGIT_BIT) { - s += 21U; - r[j] &= 0x1fffff; + while ((s + 28U) <= (word32)DIGIT_BIT) { + s += 28U; + r[j] &= 0xfffffff; if (j + 1 >= size) { break; } @@ -8620,12 +8309,4325 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) r[0] = 0; for (i = 0; i < a->used && j < size; i++) { r[j] |= ((sp_digit)a->dp[i]) << s; - if (s + DIGIT_BIT >= 21) { - r[j] &= 0x1fffff; + if (s + DIGIT_BIT >= 28) { + r[j] &= 0xfffffff; if (j + 1 >= size) { break; } - s = 21 - s; + s = 28 - s; + if (s == DIGIT_BIT) { + r[++j] = 0; + s = 0; + } + else { + r[++j] = a->dp[i] >> s; + s = DIGIT_BIT - s; + } + } + else { + s += DIGIT_BIT; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#endif +} + +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 384 + * + * r A single precision integer. + * a Byte array. 
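
The word counts in this hunk follow directly from the limb sizes: the code above stores 3072-bit values in 106 limbs of 29 bits, while the 28-bit representation introduced here needs 110 limbs and pads to 112 so the Karatsuba multipliers can halve the operands cleanly. A quick self-contained sanity check of those counts (illustrative only, not part of the patch):

    #include <assert.h>

    int main(void)
    {
        /* 29-bit limbs: ceil(3072 / 29) = 106 words */
        assert((3072 + 28) / 29 == 106);
        /* 28-bit limbs: ceil(3072 / 28) = 110 words, padded to 112 */
        assert((3072 + 27) / 28 == 110);
        /* 112 halves evenly for Karatsuba: 112 -> 56 -> 28 -> 14 */
        assert(112 / 2 == 56 && 56 / 2 == 28 && 28 / 2 == 14);
        return 0;
    }
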
+ */ +static void sp_3072_to_bin_112(sp_digit* r, byte* a) +{ + int i; + int j; + int s = 0; + int b; + + for (i=0; i<111; i++) { + r[i+1] += r[i] >> 28; + r[i] &= 0xfffffff; + } + j = 3072 / 8 - 1; + a[j] = 0; + for (i=0; i<112 && j>=0; i++) { + b = 0; + /* lint allow cast of mismatch sp_digit and int */ + a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ + b += 8 - s; + if (j < 0) { + break; + } + while (b < 28) { + a[j--] = (byte)(r[i] >> b); + b += 8; + if (j < 0) { + break; + } + } + s = 8 - (b - 28); + if (j >= 0) { + a[j] = 0; + } + if (s != 0) { + j++; + } + } +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_56(sp_digit* a) +{ + int i; + for (i = 0; i < 48; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[49] += a[48] >> 28; a[48] &= 0xfffffff; + a[50] += a[49] >> 28; a[49] &= 0xfffffff; + a[51] += a[50] >> 28; a[50] &= 0xfffffff; + a[52] += a[51] >> 28; a[51] &= 0xfffffff; + a[53] += a[52] >> 28; a[52] &= 0xfffffff; + a[54] += a[53] >> 28; a[53] &= 0xfffffff; + a[55] += a[54] >> 28; a[54] &= 0xfffffff; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_55(sp_digit* a) +{ + int i; + for (i = 0; i < 48; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[49] += a[48] >> 28; a[48] &= 0xfffffff; + a[50] += a[49] >> 28; a[49] &= 0xfffffff; + a[51] += a[50] >> 28; a[50] &= 0xfffffff; + a[52] += a[51] >> 28; a[51] &= 0xfffffff; + a[53] += a[52] >> 28; a[52] &= 0xfffffff; + a[54] += a[53] >> 28; a[53] &= 0xfffffff; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_112(sp_digit* a) +{ + int i; + for (i = 0; i < 104; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[105] += a[104] >> 28; a[104] &= 0xfffffff; + a[106] += a[105] >> 28; a[105] &= 0xfffffff; + a[107] += a[106] >> 28; a[106] &= 0xfffffff; + a[108] += a[107] >> 28; a[107] &= 0xfffffff; + a[109] += a[108] >> 28; a[108] &= 0xfffffff; + a[110] += a[109] >> 28; a[109] &= 0xfffffff; + a[111] += a[110] >> 28; a[110] &= 0xfffffff; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. 
+ */ +static void sp_3072_norm_110(sp_digit* a) +{ + int i; + for (i = 0; i < 104; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[105] += a[104] >> 28; a[104] &= 0xfffffff; + a[106] += a[105] >> 28; a[105] &= 0xfffffff; + a[107] += a[106] >> 28; a[106] &= 0xfffffff; + a[108] += a[107] >> 28; a[107] &= 0xfffffff; + a[109] += a[108] >> 28; a[108] &= 0xfffffff; +} + +#ifndef WOLFSSL_SP_SMALL +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_14(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * b[ 0]; + sp_uint64 t1 = ((sp_uint64)a[ 0]) * b[ 1] + + ((sp_uint64)a[ 1]) * b[ 0]; + sp_uint64 t2 = ((sp_uint64)a[ 0]) * b[ 2] + + ((sp_uint64)a[ 1]) * b[ 1] + + ((sp_uint64)a[ 2]) * b[ 0]; + sp_uint64 t3 = ((sp_uint64)a[ 0]) * b[ 3] + + ((sp_uint64)a[ 1]) * b[ 2] + + ((sp_uint64)a[ 2]) * b[ 1] + + ((sp_uint64)a[ 3]) * b[ 0]; + sp_uint64 t4 = ((sp_uint64)a[ 0]) * b[ 4] + + ((sp_uint64)a[ 1]) * b[ 3] + + ((sp_uint64)a[ 2]) * b[ 2] + + ((sp_uint64)a[ 3]) * b[ 1] + + ((sp_uint64)a[ 4]) * b[ 0]; + sp_uint64 t5 = ((sp_uint64)a[ 0]) * b[ 5] + + ((sp_uint64)a[ 1]) * b[ 4] + + ((sp_uint64)a[ 2]) * b[ 3] + + ((sp_uint64)a[ 3]) * b[ 2] + + ((sp_uint64)a[ 4]) * b[ 1] + + ((sp_uint64)a[ 5]) * b[ 0]; + sp_uint64 t6 = ((sp_uint64)a[ 0]) * b[ 6] + + ((sp_uint64)a[ 1]) * b[ 5] + + ((sp_uint64)a[ 2]) * b[ 4] + + ((sp_uint64)a[ 3]) * b[ 3] + + ((sp_uint64)a[ 4]) * b[ 2] + + ((sp_uint64)a[ 5]) * b[ 1] + + ((sp_uint64)a[ 6]) * b[ 0]; + sp_uint64 t7 = ((sp_uint64)a[ 0]) * b[ 7] + + ((sp_uint64)a[ 1]) * b[ 6] + + ((sp_uint64)a[ 2]) * b[ 5] + + ((sp_uint64)a[ 3]) * b[ 4] + + ((sp_uint64)a[ 4]) * b[ 3] + + ((sp_uint64)a[ 5]) * b[ 2] + + ((sp_uint64)a[ 6]) * b[ 1] + + ((sp_uint64)a[ 7]) * b[ 0]; + sp_uint64 t8 = ((sp_uint64)a[ 0]) * b[ 8] + + ((sp_uint64)a[ 1]) * b[ 7] + + ((sp_uint64)a[ 2]) * b[ 6] + + ((sp_uint64)a[ 3]) * b[ 5] + + ((sp_uint64)a[ 4]) * b[ 4] + + ((sp_uint64)a[ 5]) * b[ 3] + + ((sp_uint64)a[ 6]) * b[ 2] + + ((sp_uint64)a[ 7]) * b[ 1] + + ((sp_uint64)a[ 8]) * b[ 0]; + sp_uint64 t9 = ((sp_uint64)a[ 0]) * b[ 9] + + ((sp_uint64)a[ 1]) * b[ 8] + + ((sp_uint64)a[ 2]) * b[ 7] + + ((sp_uint64)a[ 3]) * b[ 6] + + ((sp_uint64)a[ 4]) * b[ 5] + + ((sp_uint64)a[ 5]) * b[ 4] + + ((sp_uint64)a[ 6]) * b[ 3] + + ((sp_uint64)a[ 7]) * b[ 2] + + ((sp_uint64)a[ 8]) * b[ 1] + + ((sp_uint64)a[ 9]) * b[ 0]; + sp_uint64 t10 = ((sp_uint64)a[ 0]) * b[10] + + ((sp_uint64)a[ 1]) * b[ 9] + + ((sp_uint64)a[ 2]) * b[ 8] + + ((sp_uint64)a[ 3]) * b[ 7] + + ((sp_uint64)a[ 4]) * b[ 6] + + ((sp_uint64)a[ 5]) * b[ 5] + + ((sp_uint64)a[ 6]) * b[ 4] + + ((sp_uint64)a[ 7]) * b[ 3] + + ((sp_uint64)a[ 8]) * b[ 2] + + ((sp_uint64)a[ 9]) * b[ 1] + + ((sp_uint64)a[10]) * b[ 0]; + sp_uint64 t11 = ((sp_uint64)a[ 0]) * b[11] + + ((sp_uint64)a[ 1]) * b[10] + + ((sp_uint64)a[ 2]) * b[ 9] + + ((sp_uint64)a[ 3]) * b[ 8] + + ((sp_uint64)a[ 4]) * b[ 7] + + ((sp_uint64)a[ 5]) * b[ 6] + + ((sp_uint64)a[ 6]) * b[ 5] + + ((sp_uint64)a[ 7]) * b[ 4] + + ((sp_uint64)a[ 8]) * b[ 3] + + ((sp_uint64)a[ 9]) * b[ 2] + + ((sp_uint64)a[10]) * b[ 1] + + 
((sp_uint64)a[11]) * b[ 0]; + sp_uint64 t12 = ((sp_uint64)a[ 0]) * b[12] + + ((sp_uint64)a[ 1]) * b[11] + + ((sp_uint64)a[ 2]) * b[10] + + ((sp_uint64)a[ 3]) * b[ 9] + + ((sp_uint64)a[ 4]) * b[ 8] + + ((sp_uint64)a[ 5]) * b[ 7] + + ((sp_uint64)a[ 6]) * b[ 6] + + ((sp_uint64)a[ 7]) * b[ 5] + + ((sp_uint64)a[ 8]) * b[ 4] + + ((sp_uint64)a[ 9]) * b[ 3] + + ((sp_uint64)a[10]) * b[ 2] + + ((sp_uint64)a[11]) * b[ 1] + + ((sp_uint64)a[12]) * b[ 0]; + sp_uint64 t13 = ((sp_uint64)a[ 0]) * b[13] + + ((sp_uint64)a[ 1]) * b[12] + + ((sp_uint64)a[ 2]) * b[11] + + ((sp_uint64)a[ 3]) * b[10] + + ((sp_uint64)a[ 4]) * b[ 9] + + ((sp_uint64)a[ 5]) * b[ 8] + + ((sp_uint64)a[ 6]) * b[ 7] + + ((sp_uint64)a[ 7]) * b[ 6] + + ((sp_uint64)a[ 8]) * b[ 5] + + ((sp_uint64)a[ 9]) * b[ 4] + + ((sp_uint64)a[10]) * b[ 3] + + ((sp_uint64)a[11]) * b[ 2] + + ((sp_uint64)a[12]) * b[ 1] + + ((sp_uint64)a[13]) * b[ 0]; + sp_uint64 t14 = ((sp_uint64)a[ 1]) * b[13] + + ((sp_uint64)a[ 2]) * b[12] + + ((sp_uint64)a[ 3]) * b[11] + + ((sp_uint64)a[ 4]) * b[10] + + ((sp_uint64)a[ 5]) * b[ 9] + + ((sp_uint64)a[ 6]) * b[ 8] + + ((sp_uint64)a[ 7]) * b[ 7] + + ((sp_uint64)a[ 8]) * b[ 6] + + ((sp_uint64)a[ 9]) * b[ 5] + + ((sp_uint64)a[10]) * b[ 4] + + ((sp_uint64)a[11]) * b[ 3] + + ((sp_uint64)a[12]) * b[ 2] + + ((sp_uint64)a[13]) * b[ 1]; + sp_uint64 t15 = ((sp_uint64)a[ 2]) * b[13] + + ((sp_uint64)a[ 3]) * b[12] + + ((sp_uint64)a[ 4]) * b[11] + + ((sp_uint64)a[ 5]) * b[10] + + ((sp_uint64)a[ 6]) * b[ 9] + + ((sp_uint64)a[ 7]) * b[ 8] + + ((sp_uint64)a[ 8]) * b[ 7] + + ((sp_uint64)a[ 9]) * b[ 6] + + ((sp_uint64)a[10]) * b[ 5] + + ((sp_uint64)a[11]) * b[ 4] + + ((sp_uint64)a[12]) * b[ 3] + + ((sp_uint64)a[13]) * b[ 2]; + sp_uint64 t16 = ((sp_uint64)a[ 3]) * b[13] + + ((sp_uint64)a[ 4]) * b[12] + + ((sp_uint64)a[ 5]) * b[11] + + ((sp_uint64)a[ 6]) * b[10] + + ((sp_uint64)a[ 7]) * b[ 9] + + ((sp_uint64)a[ 8]) * b[ 8] + + ((sp_uint64)a[ 9]) * b[ 7] + + ((sp_uint64)a[10]) * b[ 6] + + ((sp_uint64)a[11]) * b[ 5] + + ((sp_uint64)a[12]) * b[ 4] + + ((sp_uint64)a[13]) * b[ 3]; + sp_uint64 t17 = ((sp_uint64)a[ 4]) * b[13] + + ((sp_uint64)a[ 5]) * b[12] + + ((sp_uint64)a[ 6]) * b[11] + + ((sp_uint64)a[ 7]) * b[10] + + ((sp_uint64)a[ 8]) * b[ 9] + + ((sp_uint64)a[ 9]) * b[ 8] + + ((sp_uint64)a[10]) * b[ 7] + + ((sp_uint64)a[11]) * b[ 6] + + ((sp_uint64)a[12]) * b[ 5] + + ((sp_uint64)a[13]) * b[ 4]; + sp_uint64 t18 = ((sp_uint64)a[ 5]) * b[13] + + ((sp_uint64)a[ 6]) * b[12] + + ((sp_uint64)a[ 7]) * b[11] + + ((sp_uint64)a[ 8]) * b[10] + + ((sp_uint64)a[ 9]) * b[ 9] + + ((sp_uint64)a[10]) * b[ 8] + + ((sp_uint64)a[11]) * b[ 7] + + ((sp_uint64)a[12]) * b[ 6] + + ((sp_uint64)a[13]) * b[ 5]; + sp_uint64 t19 = ((sp_uint64)a[ 6]) * b[13] + + ((sp_uint64)a[ 7]) * b[12] + + ((sp_uint64)a[ 8]) * b[11] + + ((sp_uint64)a[ 9]) * b[10] + + ((sp_uint64)a[10]) * b[ 9] + + ((sp_uint64)a[11]) * b[ 8] + + ((sp_uint64)a[12]) * b[ 7] + + ((sp_uint64)a[13]) * b[ 6]; + sp_uint64 t20 = ((sp_uint64)a[ 7]) * b[13] + + ((sp_uint64)a[ 8]) * b[12] + + ((sp_uint64)a[ 9]) * b[11] + + ((sp_uint64)a[10]) * b[10] + + ((sp_uint64)a[11]) * b[ 9] + + ((sp_uint64)a[12]) * b[ 8] + + ((sp_uint64)a[13]) * b[ 7]; + sp_uint64 t21 = ((sp_uint64)a[ 8]) * b[13] + + ((sp_uint64)a[ 9]) * b[12] + + ((sp_uint64)a[10]) * b[11] + + ((sp_uint64)a[11]) * b[10] + + ((sp_uint64)a[12]) * b[ 9] + + ((sp_uint64)a[13]) * b[ 8]; + sp_uint64 t22 = ((sp_uint64)a[ 9]) * b[13] + + ((sp_uint64)a[10]) * b[12] + + ((sp_uint64)a[11]) * b[11] + + ((sp_uint64)a[12]) * b[10] + + ((sp_uint64)a[13]) * b[ 9]; + sp_uint64 
t23 = ((sp_uint64)a[10]) * b[13] + + ((sp_uint64)a[11]) * b[12] + + ((sp_uint64)a[12]) * b[11] + + ((sp_uint64)a[13]) * b[10]; + sp_uint64 t24 = ((sp_uint64)a[11]) * b[13] + + ((sp_uint64)a[12]) * b[12] + + ((sp_uint64)a[13]) * b[11]; + sp_uint64 t25 = ((sp_uint64)a[12]) * b[13] + + ((sp_uint64)a[13]) * b[12]; + sp_uint64 t26 = ((sp_uint64)a[13]) * b[13]; + + t1 += t0 >> 28; r[ 0] = t0 & 0xfffffff; + t2 += t1 >> 28; r[ 1] = t1 & 0xfffffff; + t3 += t2 >> 28; r[ 2] = t2 & 0xfffffff; + t4 += t3 >> 28; r[ 3] = t3 & 0xfffffff; + t5 += t4 >> 28; r[ 4] = t4 & 0xfffffff; + t6 += t5 >> 28; r[ 5] = t5 & 0xfffffff; + t7 += t6 >> 28; r[ 6] = t6 & 0xfffffff; + t8 += t7 >> 28; r[ 7] = t7 & 0xfffffff; + t9 += t8 >> 28; r[ 8] = t8 & 0xfffffff; + t10 += t9 >> 28; r[ 9] = t9 & 0xfffffff; + t11 += t10 >> 28; r[10] = t10 & 0xfffffff; + t12 += t11 >> 28; r[11] = t11 & 0xfffffff; + t13 += t12 >> 28; r[12] = t12 & 0xfffffff; + t14 += t13 >> 28; r[13] = t13 & 0xfffffff; + t15 += t14 >> 28; r[14] = t14 & 0xfffffff; + t16 += t15 >> 28; r[15] = t15 & 0xfffffff; + t17 += t16 >> 28; r[16] = t16 & 0xfffffff; + t18 += t17 >> 28; r[17] = t17 & 0xfffffff; + t19 += t18 >> 28; r[18] = t18 & 0xfffffff; + t20 += t19 >> 28; r[19] = t19 & 0xfffffff; + t21 += t20 >> 28; r[20] = t20 & 0xfffffff; + t22 += t21 >> 28; r[21] = t21 & 0xfffffff; + t23 += t22 >> 28; r[22] = t22 & 0xfffffff; + t24 += t23 >> 28; r[23] = t23 & 0xfffffff; + t25 += t24 >> 28; r[24] = t24 & 0xfffffff; + t26 += t25 >> 28; r[25] = t25 & 0xfffffff; + r[27] = (sp_digit)(t26 >> 28); + r[26] = t26 & 0xfffffff; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_14(sp_digit* r, const sp_digit* a) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; + sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; + sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 + + ((sp_uint64)a[ 1]) * a[ 1]; + sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] + + ((sp_uint64)a[ 1]) * a[ 2]) * 2; + sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] + + ((sp_uint64)a[ 1]) * a[ 3]) * 2 + + ((sp_uint64)a[ 2]) * a[ 2]; + sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] + + ((sp_uint64)a[ 1]) * a[ 4] + + ((sp_uint64)a[ 2]) * a[ 3]) * 2; + sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] + + ((sp_uint64)a[ 1]) * a[ 5] + + ((sp_uint64)a[ 2]) * a[ 4]) * 2 + + ((sp_uint64)a[ 3]) * a[ 3]; + sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] + + ((sp_uint64)a[ 1]) * a[ 6] + + ((sp_uint64)a[ 2]) * a[ 5] + + ((sp_uint64)a[ 3]) * a[ 4]) * 2; + sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] + + ((sp_uint64)a[ 1]) * a[ 7] + + ((sp_uint64)a[ 2]) * a[ 6] + + ((sp_uint64)a[ 3]) * a[ 5]) * 2 + + ((sp_uint64)a[ 4]) * a[ 4]; + sp_uint64 t9 = (((sp_uint64)a[ 0]) * a[ 9] + + ((sp_uint64)a[ 1]) * a[ 8] + + ((sp_uint64)a[ 2]) * a[ 7] + + ((sp_uint64)a[ 3]) * a[ 6] + + ((sp_uint64)a[ 4]) * a[ 5]) * 2; + sp_uint64 t10 = (((sp_uint64)a[ 0]) * a[10] + + ((sp_uint64)a[ 1]) * a[ 9] + + ((sp_uint64)a[ 2]) * a[ 8] + + ((sp_uint64)a[ 3]) * a[ 7] + + ((sp_uint64)a[ 4]) * a[ 6]) * 2 + + ((sp_uint64)a[ 5]) * a[ 5]; + sp_uint64 t11 = (((sp_uint64)a[ 0]) * a[11] + + ((sp_uint64)a[ 1]) * a[10] + + ((sp_uint64)a[ 2]) * a[ 9] + + ((sp_uint64)a[ 3]) * a[ 8] + + ((sp_uint64)a[ 4]) * a[ 7] + + ((sp_uint64)a[ 5]) * a[ 6]) * 2; + sp_uint64 t12 = (((sp_uint64)a[ 0]) * a[12] + + ((sp_uint64)a[ 1]) * a[11] + + ((sp_uint64)a[ 2]) * a[10] + + ((sp_uint64)a[ 3]) * a[ 9] + + ((sp_uint64)a[ 4]) * a[ 8] + + ((sp_uint64)a[ 5]) * a[ 7]) * 2 + + ((sp_uint64)a[ 6]) * a[ 6]; + sp_uint64 
t13 = (((sp_uint64)a[ 0]) * a[13] + + ((sp_uint64)a[ 1]) * a[12] + + ((sp_uint64)a[ 2]) * a[11] + + ((sp_uint64)a[ 3]) * a[10] + + ((sp_uint64)a[ 4]) * a[ 9] + + ((sp_uint64)a[ 5]) * a[ 8] + + ((sp_uint64)a[ 6]) * a[ 7]) * 2; + sp_uint64 t14 = (((sp_uint64)a[ 1]) * a[13] + + ((sp_uint64)a[ 2]) * a[12] + + ((sp_uint64)a[ 3]) * a[11] + + ((sp_uint64)a[ 4]) * a[10] + + ((sp_uint64)a[ 5]) * a[ 9] + + ((sp_uint64)a[ 6]) * a[ 8]) * 2 + + ((sp_uint64)a[ 7]) * a[ 7]; + sp_uint64 t15 = (((sp_uint64)a[ 2]) * a[13] + + ((sp_uint64)a[ 3]) * a[12] + + ((sp_uint64)a[ 4]) * a[11] + + ((sp_uint64)a[ 5]) * a[10] + + ((sp_uint64)a[ 6]) * a[ 9] + + ((sp_uint64)a[ 7]) * a[ 8]) * 2; + sp_uint64 t16 = (((sp_uint64)a[ 3]) * a[13] + + ((sp_uint64)a[ 4]) * a[12] + + ((sp_uint64)a[ 5]) * a[11] + + ((sp_uint64)a[ 6]) * a[10] + + ((sp_uint64)a[ 7]) * a[ 9]) * 2 + + ((sp_uint64)a[ 8]) * a[ 8]; + sp_uint64 t17 = (((sp_uint64)a[ 4]) * a[13] + + ((sp_uint64)a[ 5]) * a[12] + + ((sp_uint64)a[ 6]) * a[11] + + ((sp_uint64)a[ 7]) * a[10] + + ((sp_uint64)a[ 8]) * a[ 9]) * 2; + sp_uint64 t18 = (((sp_uint64)a[ 5]) * a[13] + + ((sp_uint64)a[ 6]) * a[12] + + ((sp_uint64)a[ 7]) * a[11] + + ((sp_uint64)a[ 8]) * a[10]) * 2 + + ((sp_uint64)a[ 9]) * a[ 9]; + sp_uint64 t19 = (((sp_uint64)a[ 6]) * a[13] + + ((sp_uint64)a[ 7]) * a[12] + + ((sp_uint64)a[ 8]) * a[11] + + ((sp_uint64)a[ 9]) * a[10]) * 2; + sp_uint64 t20 = (((sp_uint64)a[ 7]) * a[13] + + ((sp_uint64)a[ 8]) * a[12] + + ((sp_uint64)a[ 9]) * a[11]) * 2 + + ((sp_uint64)a[10]) * a[10]; + sp_uint64 t21 = (((sp_uint64)a[ 8]) * a[13] + + ((sp_uint64)a[ 9]) * a[12] + + ((sp_uint64)a[10]) * a[11]) * 2; + sp_uint64 t22 = (((sp_uint64)a[ 9]) * a[13] + + ((sp_uint64)a[10]) * a[12]) * 2 + + ((sp_uint64)a[11]) * a[11]; + sp_uint64 t23 = (((sp_uint64)a[10]) * a[13] + + ((sp_uint64)a[11]) * a[12]) * 2; + sp_uint64 t24 = (((sp_uint64)a[11]) * a[13]) * 2 + + ((sp_uint64)a[12]) * a[12]; + sp_uint64 t25 = (((sp_uint64)a[12]) * a[13]) * 2; + sp_uint64 t26 = ((sp_uint64)a[13]) * a[13]; + + t1 += t0 >> 28; r[ 0] = t0 & 0xfffffff; + t2 += t1 >> 28; r[ 1] = t1 & 0xfffffff; + t3 += t2 >> 28; r[ 2] = t2 & 0xfffffff; + t4 += t3 >> 28; r[ 3] = t3 & 0xfffffff; + t5 += t4 >> 28; r[ 4] = t4 & 0xfffffff; + t6 += t5 >> 28; r[ 5] = t5 & 0xfffffff; + t7 += t6 >> 28; r[ 6] = t6 & 0xfffffff; + t8 += t7 >> 28; r[ 7] = t7 & 0xfffffff; + t9 += t8 >> 28; r[ 8] = t8 & 0xfffffff; + t10 += t9 >> 28; r[ 9] = t9 & 0xfffffff; + t11 += t10 >> 28; r[10] = t10 & 0xfffffff; + t12 += t11 >> 28; r[11] = t11 & 0xfffffff; + t13 += t12 >> 28; r[12] = t12 & 0xfffffff; + t14 += t13 >> 28; r[13] = t13 & 0xfffffff; + t15 += t14 >> 28; r[14] = t14 & 0xfffffff; + t16 += t15 >> 28; r[15] = t15 & 0xfffffff; + t17 += t16 >> 28; r[16] = t16 & 0xfffffff; + t18 += t17 >> 28; r[17] = t17 & 0xfffffff; + t19 += t18 >> 28; r[18] = t18 & 0xfffffff; + t20 += t19 >> 28; r[19] = t19 & 0xfffffff; + t21 += t20 >> 28; r[20] = t20 & 0xfffffff; + t22 += t21 >> 28; r[21] = t21 & 0xfffffff; + t23 += t22 >> 28; r[22] = t22 & 0xfffffff; + t24 += t23 >> 28; r[23] = t23 & 0xfffffff; + t25 += t24 >> 28; r[24] = t24 & 0xfffffff; + t26 += t25 >> 28; r[25] = t25 & 0xfffffff; + r[27] = (sp_digit)(t26 >> 28); + r[26] = t26 & 0xfffffff; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
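
sp_3072_mul_14 and sp_3072_sqr_14 are fully unrolled product-scanning (Comba) multiplications: every output column is summed into a 64-bit accumulator, and carries are only propagated 28 bits at a time once all columns are formed, with the square folding symmetric products together. A loop form of the same idea, with hypothetical names and small operands so the result is easy to check by hand:

    #include <stdint.h>
    #include <stdio.h>

    #define LIMB_BITS 28
    #define LIMB_MASK 0xfffffffu

    /* Product-scanning multiply of two n-limb numbers (28-bit limbs) into
     * r (2n limbs). Column sums stay well below 2^64 for the limb counts
     * used in this file (n = 14, 28, ...). */
    static void mul_comba(uint32_t* r, const uint32_t* a, const uint32_t* b,
        int n)
    {
        uint64_t acc = 0;       /* running column sum plus incoming carry */
        int k;

        for (k = 0; k < 2 * n - 1; k++) {
            int i;
            int lo = (k < n) ? 0 : (k - n + 1);
            int hi = (k < n) ? k : (n - 1);
            for (i = lo; i <= hi; i++) {
                acc += (uint64_t)a[i] * b[k - i];
            }
            r[k] = (uint32_t)(acc & LIMB_MASK);
            acc >>= LIMB_BITS;
        }
        r[2 * n - 1] = (uint32_t)acc;
    }

    int main(void)
    {
        uint32_t a[2] = { 0x2345678, 0x1 };     /* 0x12345678 in 28-bit limbs */
        uint32_t b[2] = { 0x0000002, 0x0 };     /* 2 */
        uint32_t r[4];
        mul_comba(r, a, b, 2);
        printf("%x %x\n", r[1], r[0]);          /* prints "2 468acf0" */
        return 0;
    }
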
+ */ +SP_NOINLINE static int sp_3072_add_14(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + r[ 0] = a[ 0] + b[ 0]; + r[ 1] = a[ 1] + b[ 1]; + r[ 2] = a[ 2] + b[ 2]; + r[ 3] = a[ 3] + b[ 3]; + r[ 4] = a[ 4] + b[ 4]; + r[ 5] = a[ 5] + b[ 5]; + r[ 6] = a[ 6] + b[ 6]; + r[ 7] = a[ 7] + b[ 7]; + r[ 8] = a[ 8] + b[ 8]; + r[ 9] = a[ 9] + b[ 9]; + r[10] = a[10] + b[10]; + r[11] = a[11] + b[11]; + r[12] = a[12] + b[12]; + r[13] = a[13] + b[13]; + + return 0; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[24] = a[24] + b[24]; + r[25] = a[25] + b[25]; + r[26] = a[26] + b[26]; + r[27] = a[27] + b[27]; + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[24] = a[24] - b[24]; + r[25] = a[25] - b[25]; + r[26] = a[26] - b[26]; + r[27] = a[27] - b[27]; + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_14(sp_digit* a) +{ + a[1] += a[0] >> 28; a[0] &= 0xfffffff; + a[2] += a[1] >> 28; a[1] &= 0xfffffff; + a[3] += a[2] >> 28; a[2] &= 0xfffffff; + a[4] += a[3] >> 28; a[3] &= 0xfffffff; + a[5] += a[4] >> 28; a[4] &= 0xfffffff; + a[6] += a[5] >> 28; a[5] &= 0xfffffff; + a[7] += a[6] >> 28; a[6] &= 0xfffffff; + a[8] += a[7] >> 28; a[7] &= 0xfffffff; + a[9] += a[8] >> 28; a[8] &= 0xfffffff; + a[10] += a[9] >> 28; a[9] &= 0xfffffff; + a[11] += a[10] >> 28; a[10] &= 0xfffffff; + a[12] += a[11] >> 28; a[11] &= 0xfffffff; + a[13] += a[12] >> 28; a[12] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_28(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[28]; + sp_digit* a1 = z1; + sp_digit b1[14]; + sp_digit* z2 = r + 28; + (void)sp_3072_add_14(a1, a, &a[14]); + sp_3072_norm_14(a1); + (void)sp_3072_add_14(b1, b, &b[14]); + sp_3072_norm_14(b1); + sp_3072_mul_14(z2, &a[14], &b[14]); + sp_3072_mul_14(z0, a, b); + sp_3072_mul_14(z1, a1, b1); + (void)sp_3072_sub_28(z1, z1, z2); + (void)sp_3072_sub_28(z1, z1, z0); + (void)sp_3072_add_28(r + 14, r + 14, z1); + sp_3072_norm_56(r); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_sqr_28(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z1[28]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 28; + (void)sp_3072_add_14(a1, a, &a[14]); + sp_3072_norm_14(a1); + sp_3072_sqr_14(z2, &a[14]); + sp_3072_sqr_14(z0, a); + sp_3072_sqr_14(z1, a1); + (void)sp_3072_sub_28(z1, z1, z2); + (void)sp_3072_sub_28(z1, z1, z0); + (void)sp_3072_add_28(r + 14, r + 14, z1); + sp_3072_norm_56(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 56; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 56; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_28(sp_digit* a) +{ + int i; + for (i = 0; i < 24; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[25] += a[24] >> 28; a[24] &= 0xfffffff; + a[26] += a[25] >> 28; a[25] &= 0xfffffff; + a[27] += a[26] >> 28; a[26] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_56(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[56]; + sp_digit* a1 = z1; + sp_digit b1[28]; + sp_digit* z2 = r + 56; + (void)sp_3072_add_28(a1, a, &a[28]); + sp_3072_norm_28(a1); + (void)sp_3072_add_28(b1, b, &b[28]); + sp_3072_norm_28(b1); + sp_3072_mul_28(z2, &a[28], &b[28]); + sp_3072_mul_28(z0, a, b); + sp_3072_mul_28(z1, a1, b1); + (void)sp_3072_sub_56(z1, z1, z2); + (void)sp_3072_sub_56(z1, z1, z0); + (void)sp_3072_add_56(r + 28, r + 28, z1); + sp_3072_norm_112(r); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_sqr_56(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z1[56]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 56; + (void)sp_3072_add_28(a1, a, &a[28]); + sp_3072_norm_28(a1); + sp_3072_sqr_28(z2, &a[28]); + sp_3072_sqr_28(z0, a); + sp_3072_sqr_28(z1, a1); + (void)sp_3072_sub_56(z1, z1, z2); + (void)sp_3072_sub_56(z1, z1, z0); + (void)sp_3072_add_56(r + 28, r + 28, z1); + sp_3072_norm_112(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_add_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 112; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 112; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + + return 0; +} + +/* Normalize the values in each word to 28 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_224(sp_digit* a) +{ + int i; + for (i = 0; i < 216; i += 8) { + a[i+1] += a[i+0] >> 28; a[i+0] &= 0xfffffff; + a[i+2] += a[i+1] >> 28; a[i+1] &= 0xfffffff; + a[i+3] += a[i+2] >> 28; a[i+2] &= 0xfffffff; + a[i+4] += a[i+3] >> 28; a[i+3] &= 0xfffffff; + a[i+5] += a[i+4] >> 28; a[i+4] &= 0xfffffff; + a[i+6] += a[i+5] >> 28; a[i+5] &= 0xfffffff; + a[i+7] += a[i+6] >> 28; a[i+6] &= 0xfffffff; + a[i+8] += a[i+7] >> 28; a[i+7] &= 0xfffffff; + } + a[217] += a[216] >> 28; a[216] &= 0xfffffff; + a[218] += a[217] >> 28; a[217] &= 0xfffffff; + a[219] += a[218] >> 28; a[218] &= 0xfffffff; + a[220] += a[219] >> 28; a[219] &= 0xfffffff; + a[221] += a[220] >> 28; a[220] &= 0xfffffff; + a[222] += a[221] >> 28; a[221] &= 0xfffffff; + a[223] += a[222] >> 28; a[222] &= 0xfffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_112(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[112]; + sp_digit* a1 = z1; + sp_digit b1[56]; + sp_digit* z2 = r + 112; + (void)sp_3072_add_56(a1, a, &a[56]); + sp_3072_norm_56(a1); + (void)sp_3072_add_56(b1, b, &b[56]); + sp_3072_norm_56(b1); + sp_3072_mul_56(z2, &a[56], &b[56]); + sp_3072_mul_56(z0, a, b); + sp_3072_mul_56(z1, a1, b1); + (void)sp_3072_sub_112(z1, z1, z2); + (void)sp_3072_sub_112(z1, z1, z0); + (void)sp_3072_add_112(r + 56, r + 56, z1); + sp_3072_norm_224(r); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_3072_sqr_112(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z1[112]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 112; + (void)sp_3072_add_56(a1, a, &a[56]); + sp_3072_norm_56(a1); + sp_3072_sqr_56(z2, &a[56]); + sp_3072_sqr_56(z0, a); + sp_3072_sqr_56(z1, a1); + (void)sp_3072_sub_112(z1, z1, z2); + (void)sp_3072_sub_112(z1, z1, z0); + (void)sp_3072_add_112(r + 56, r + 56, z1); + sp_3072_norm_224(r); +} + +#endif /* !WOLFSSL_SP_SMALL */ +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x &= 0xfffffff; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 28) - x; +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_112(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 112; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 3] = (sp_digit)t2; + } + r[112] = (sp_digit)(t & 0xfffffff); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 3072 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_3072_mont_norm_56(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i = 0; i < 48; i += 8) { + r[i + 0] = 0xfffffff; + r[i + 1] = 0xfffffff; + r[i + 2] = 0xfffffff; + r[i + 3] = 0xfffffff; + r[i + 4] = 0xfffffff; + r[i + 5] = 0xfffffff; + r[i + 6] = 0xfffffff; + r[i + 7] = 0xfffffff; + } + r[48] = 0xfffffff; + r[49] = 0xfffffff; + r[50] = 0xfffffff; + r[51] = 0xfffffff; + r[52] = 0xfffffff; + r[53] = 0xfffffff; + r[54] = 0xffffffL; + r[55] = 0; + + /* r = (2^n - 1) mod n */ + (void)sp_3072_sub_56(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_3072_cmp_56(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i = 48; i >= 0; i -= 8) { + r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 
1 : 0)); + r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_3072_cond_sub_56(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 56; i += 8) { + r[i + 0] = a[i + 0] - (b[i + 0] & m); + r[i + 1] = a[i + 1] - (b[i + 1] & m); + r[i + 2] = a[i + 2] - (b[i + 2] & m); + r[i + 3] = a[i + 3] - (b[i + 3] & m); + r[i + 4] = a[i + 4] - (b[i + 4] & m); + r[i + 5] = a[i + 5] - (b[i + 5] & m); + r[i + 6] = a[i + 6] - (b[i + 6] & m); + r[i + 7] = a[i + 7] - (b[i + 7] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_add_56(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 56; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0xfffffff; + t >>= 28; + } + r[56] += (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 48; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0xfffffff; + t[1] += t[0] >> 28; + r[i+1] = t[1] & 0xfffffff; + t[2] += t[1] >> 28; + r[i+2] = t[2] & 0xfffffff; + t[3] += t[2] >> 28; + r[i+3] = t[3] & 0xfffffff; + t[4] += t[3] >> 28; + r[i+4] = t[4] & 0xfffffff; + t[5] += t[4] >> 28; + r[i+5] = t[5] & 0xfffffff; + t[6] += t[5] >> 28; + r[i+6] = t[6] & 0xfffffff; + t[7] += t[6] >> 28; + r[i+7] = t[7] & 0xfffffff; + t[0] = t[7] >> 28; + } + t[0] += (tb * a[48]) + r[48]; + t[1] = (tb * a[49]) + r[49]; + t[2] = (tb * a[50]) + r[50]; + t[3] = (tb * a[51]) + r[51]; + t[4] = (tb * a[52]) + r[52]; + t[5] = (tb * a[53]) + r[53]; + t[6] = (tb * a[54]) + r[54]; + t[7] = (tb * a[55]) + r[55]; + r[48] = t[0] & 0xfffffff; + t[1] += t[0] >> 28; + r[49] = t[1] & 0xfffffff; + t[2] += t[1] >> 28; + r[50] = t[2] & 0xfffffff; + t[3] += t[2] >> 28; + r[51] = t[3] & 0xfffffff; + t[4] += t[3] >> 28; + r[52] = t[4] & 0xfffffff; + t[5] += t[4] >> 28; + r[53] = t[5] & 0xfffffff; + t[6] += t[5] >> 28; + r[54] = t[6] & 0xfffffff; + t[7] += t[6] >> 28; + r[55] = t[7] & 0xfffffff; + r[56] += (sp_digit)(t[7] >> 28); +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Shift the result in the high 1536 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
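
sp_3072_cmp_56 and sp_3072_cond_sub_56 above are written to run in constant time: the compare folds each limb difference in under a mask that drops to zero once a difference has been seen, and the conditional subtract always touches every limb, using the mask to decide whether b contributes. A reduced sketch of both patterns with hypothetical names (the limbs behave as signed values here, which the later r[54] < 0 check also relies on, and that is what makes the sign of the compare meaningful):

    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t limb;   /* 28-bit values held in a signed 32-bit type */

    /* Negative, zero or positive like sp_3072_cmp_56; no data-dependent
     * branches, only a mask derived from whether r is still zero. */
    static limb cmp_ct(const limb* a, const limb* b, int n)
    {
        limb r = 0;
        int i;
        for (i = n - 1; i >= 0; i--) {
            r |= (a[i] - b[i]) & (0 - (limb)((r == 0) ? 1 : 0));
        }
        return r;
    }

    /* r = a - (b & m) per limb: m is 0 (copy a) or all ones (subtract b),
     * so the work done does not depend on the secret condition. */
    static void cond_sub_ct(limb* r, const limb* a, const limb* b, int n,
        limb m)
    {
        int i;
        for (i = 0; i < n; i++) {
            r[i] = a[i] - (b[i] & m);
        }
    }

    int main(void)
    {
        limb a[3] = { 5, 0, 7 };
        limb b[3] = { 9, 0, 7 };
        limb r[3];
        printf("%d\n", cmp_ct(a, b, 3) < 0);    /* prints 1: a < b */
        cond_sub_ct(r, a, b, 3, 0);             /* mask 0: r is a copy of a */
        printf("%d\n", (int)r[0]);              /* prints 5 */
        return 0;
    }
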
+ */ +static void sp_3072_mont_shift_56(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int64 n = a[54] >> 24; + n += ((sp_int64)a[55]) << 4; + for (i = 0; i < 48; i += 8) { + r[i + 0] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 56]) << 4; + r[i + 1] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 57]) << 4; + r[i + 2] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 58]) << 4; + r[i + 3] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 59]) << 4; + r[i + 4] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 60]) << 4; + r[i + 5] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 61]) << 4; + r[i + 6] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 62]) << 4; + r[i + 7] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 63]) << 4; + } + r[48] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[104]) << 4; + r[49] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[105]) << 4; + r[50] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[106]) << 4; + r[51] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[107]) << 4; + r[52] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[108]) << 4; + r[53] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[109]) << 4; + r[54] = (sp_digit)n; + XMEMSET(&r[55], 0, sizeof(*r) * 55U); +} + +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_3072_mont_reduce_56(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_3072_norm_56(a + 55); + + for (i=0; i<54; i++) { + mu = (a[i] * mp) & 0xfffffff; + sp_3072_mul_add_56(a+i, m, mu); + a[i+1] += a[i] >> 28; + } + mu = (a[i] * mp) & 0xffffffL; + sp_3072_mul_add_56(a+i, m, mu); + a[i+1] += a[i] >> 28; + a[i] &= 0xfffffff; + sp_3072_mont_shift_56(a, a); + sp_3072_cond_sub_56(a, a, m, 0 - (((a[54] - m[54]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_3072_norm_56(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_mul_56(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_3072_mul_56(r, a, b); + sp_3072_mont_reduce_56(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_sqr_56(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_3072_sqr_56(r, a); + sp_3072_mont_reduce_56(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. 
+ */ +SP_NOINLINE static void sp_3072_mul_d_56(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 56; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 3] = (sp_digit)t2; + } + r[56] = (sp_digit)(t & 0xfffffff); +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_3072_cond_add_56(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 48; i += 8) { + r[i + 0] = a[i + 0] + (b[i + 0] & m); + r[i + 1] = a[i + 1] + (b[i + 1] & m); + r[i + 2] = a[i + 2] + (b[i + 2] & m); + r[i + 3] = a[i + 3] + (b[i + 3] & m); + r[i + 4] = a[i + 4] + (b[i + 4] & m); + r[i + 5] = a[i + 5] + (b[i + 5] & m); + r[i + 6] = a[i + 6] + (b[i + 6] & m); + r[i + 7] = a[i + 7] + (b[i + 7] & m); + } + r[48] = a[48] + (b[48] & m); + r[49] = a[49] + (b[49] & m); + r[50] = a[50] + (b[50] & m); + r[51] = a[51] + (b[51] & m); + r[52] = a[52] + (b[52] & m); + r[53] = a[53] + (b[53] & m); + r[54] = a[54] + (b[54] & m); +} + +SP_NOINLINE static void sp_3072_rshift_56(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<48; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (28 - n)) & 0xfffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (28 - n)) & 0xfffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (28 - n)) & 0xfffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (28 - n)) & 0xfffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (28 - n)) & 0xfffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (28 - n)) & 0xfffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (28 - n)) & 0xfffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (28 - n)) & 0xfffffff); + } + r[48] = (a[48] >> n) | ((a[49] << (28 - n)) & 0xfffffff); + r[49] = (a[49] >> n) | ((a[50] << (28 - n)) & 0xfffffff); + r[50] = (a[50] >> n) | ((a[51] << (28 - n)) & 0xfffffff); + r[51] = (a[51] >> n) | ((a[52] << (28 - n)) & 0xfffffff); + r[52] = (a[52] >> n) | ((a[53] << (28 - n)) & 0xfffffff); + r[53] = (a[53] >> n) | ((a[54] << (28 - n)) & 0xfffffff); + r[54] = (a[54] >> n) | ((a[55] << (28 - n)) & 0xfffffff); + r[55] = a[55] >> n; +} + +#ifdef WOLFSSL_SP_DIV_32 +static WC_INLINE sp_digit sp_3072_div_word_56(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 28 bits from d1 and top 3 bits from d0. */ + d = (d1 << 3) + (d0 >> 25); + r = d / dv; + d -= r * dv; + /* Up to 4 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 22) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 19) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 10 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 16) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 3 bits from d0. 
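
sp_3072_cond_add_56 (and the earlier cond_sub) take a mask of 0 or all ones instead of a flag, so the same loads, ANDs and adds run whether or not the value is really changed. A minimal sketch of the pattern without the 8-way unrolling; the helper name is illustrative.

    #include <stdint.h>

    /* Branch-free conditional add: r = a + (b & m) digit by digit, where m
     * must be either 0 (no-op) or an all-ones mask (add). */
    static void cond_add(uint32_t* r, const uint32_t* a, const uint32_t* b,
                         uint32_t m, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            r[i] = a[i] + (b[i] & m);
    }

Callers build the mask from a predicate; the division routine that follows passes 0 - ((r[54] < 0) ? 1 : 0) so the modulus is added back exactly when the top digit went negative.
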
*/ + r <<= 3; + d <<= 3; + d += (d0 >> 13) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 16 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 10) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 7) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 22 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 4) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 1) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 28 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 28 bits from d1 and top 3 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_32 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_div_56(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_32 + sp_int64 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 56 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 56 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 112 + 1; + sd = t2 + 56 + 1; + + sp_3072_mul_d_56(sd, d, (sp_digit)1 << 4); + sp_3072_mul_d_112(t1, a, (sp_digit)1 << 4); + dv = sd[54]; + t1[55 + 55] += t1[55 + 55 - 1] >> 28; + t1[55 + 55 - 1] &= 0xfffffff; + for (i=55; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_32 + d1 = t1[55 + i]; + d1 <<= 28; + d1 += t1[55 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_56(t1[55 + i], t1[55 + i - 1], dv); +#endif + + sp_3072_mul_d_56(t2, sd, r1); + (void)sp_3072_sub_56(&t1[i], &t1[i], t2); + sp_3072_norm_55(&t1[i]); + t1[55 + i] += t1[55 + i - 1] >> 28; + t1[55 + i - 1] &= 0xfffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[55 + i]; + d1 <<= 28; + d1 -= t1[55 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_56(-t1[55 + i], -t1[55 + i - 1], dv); +#endif + r1 -= t1[55 + i]; + sp_3072_mul_d_56(t2, sd, r1); + (void)sp_3072_add_56(&t1[i], &t1[i], t2); + t1[55 + i] += t1[55 + i - 1] >> 28; + t1[55 + i - 1] &= 0xfffffff; + } + t1[55 - 1] += t1[55 - 2] >> 28; + t1[55 - 2] &= 0xfffffff; + r1 = t1[55 - 1] / dv; + + sp_3072_mul_d_56(t2, sd, r1); + sp_3072_sub_56(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 112U); + for (i=0; i<54; i++) { + r[i+1] += r[i] >> 28; + r[i] &= 0xfffffff; + } + sp_3072_cond_add_56(r, r, sd, 0 - ((r[54] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_55(r); + sp_3072_rshift_56(r, r, 4); + r[55] = 0; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. 
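
sp_3072_div_word_56 exists for WOLFSSL_SP_DIV_32 builds that have no usable 64-bit divide: it rebuilds the quotient of a two-digit value three bits at a time using only 32-bit divisions. The sketch below shows the same idea one bit at a time, which is easier to follow; it assumes d1 < dv so the quotient fits comfortably in 32 bits, and the names are illustrative.

    #include <stdint.h>

    #define DIGIT_BITS 28

    /* Quotient of the two-digit value (d1:d0) by dv using 32-bit arithmetic
     * only.  Requires d1 < dv and d0 < 2^DIGIT_BITS. */
    static uint32_t div_word(uint32_t d1, uint32_t d0, uint32_t dv)
    {
        uint32_t q = 0;
        uint32_t rem = d1;
        int i;

        for (i = DIGIT_BITS - 1; i >= 0; i--) {
            rem = (rem << 1) | ((d0 >> i) & 1);  /* bring down one dividend bit */
            q <<= 1;
            if (rem >= dv) {     /* one compare/subtract per bit is enough */
                rem -= dv;       /* because rem < dv before the shift      */
                q |= 1;
            }
        }
        return q;   /* == (uint32_t)((((uint64_t)d1 << DIGIT_BITS) | d0) / dv) */
    }

Taking three bits per step, as the patch does, turns the 28 compare/subtract rounds into ten 32-bit divides, which is the better trade when a hardware divide instruction exists.
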
(r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_mod_56(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_56(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_56(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 112]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 56 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 56 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 56U * 2U); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_56(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_56(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 56U); + } + } + if (err == MP_OKAY) { + sp_3072_mul_56(t[1], t[1], norm); + err = sp_3072_mod_56(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 28; + c = bits % 28; + n = e[i--] << (28 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 28; + } + + y = (int)((n >> 27) & 1); + n <<= 1; + + sp_3072_mont_mul_56(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 56 * 2); + sp_3072_mont_sqr_56(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 56 * 2); + } + + sp_3072_mont_reduce_56(t[0], m, mp); + n = sp_3072_cmp_56(t[0], m); + sp_3072_cond_sub_56(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 56 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 112]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 56 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 56 * 2); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_56(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_56(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_56(t[1], t[1], norm); + err = sp_3072_mod_56(t[1], t[1], m); + } + } + else { + sp_3072_mul_56(t[1], a, norm); + err = sp_3072_mod_56(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 28; + c = bits % 28; + n = e[i--] << (28 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 28; + } + + y = (int)((n >> 27) & 1); + n <<= 1; + + sp_3072_mont_mul_56(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 56 * 2); + sp_3072_mont_sqr_56(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 56 * 2); + } + + sp_3072_mont_reduce_56(t[0], m, mp); + n = sp_3072_cmp_56(t[0], m); + sp_3072_cond_sub_56(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 56 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(32 * 112) + 112]; +#endif + sp_digit* t[32]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 112) + 112), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<32; i++) + t[i] = td + i * 112; + rt = td + 3584; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_56(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_56(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_56(t[1], t[1], norm); + err = sp_3072_mod_56(t[1], t[1], m); + } + } + else { + sp_3072_mul_56(t[1], a, norm); + err = sp_3072_mod_56(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_56(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_56(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_56(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_56(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_56(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_56(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_56(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_56(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_56(t[10], t[ 5], m, mp); + sp_3072_mont_mul_56(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_56(t[12], t[ 6], m, mp); + sp_3072_mont_mul_56(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_56(t[14], t[ 7], m, mp); + sp_3072_mont_mul_56(t[15], t[ 8], t[ 7], m, mp); + sp_3072_mont_sqr_56(t[16], t[ 8], m, mp); + sp_3072_mont_mul_56(t[17], t[ 9], t[ 8], m, mp); + sp_3072_mont_sqr_56(t[18], t[ 9], m, mp); + sp_3072_mont_mul_56(t[19], t[10], t[ 9], m, mp); + sp_3072_mont_sqr_56(t[20], t[10], m, mp); + sp_3072_mont_mul_56(t[21], t[11], t[10], m, mp); + sp_3072_mont_sqr_56(t[22], t[11], m, mp); + sp_3072_mont_mul_56(t[23], t[12], t[11], m, mp); + sp_3072_mont_sqr_56(t[24], t[12], m, mp); + sp_3072_mont_mul_56(t[25], t[13], t[12], m, mp); + sp_3072_mont_sqr_56(t[26], t[13], m, mp); + sp_3072_mont_mul_56(t[27], t[14], t[13], m, mp); + sp_3072_mont_sqr_56(t[28], t[14], m, mp); + sp_3072_mont_mul_56(t[29], t[15], t[14], m, mp); + sp_3072_mont_sqr_56(t[30], t[15], m, mp); + sp_3072_mont_mul_56(t[31], t[16], t[15], m, mp); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 27) / 28) - 1; + c = bits % 28; + if (c == 0) { + c = 28; + } + if (i < 56) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (4 - c); + c += 28; + } + y = (int)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 112); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c = 23; + } + else { + y = (byte)((n >> 27) & 0x1f); + n = e[i--] << 4; + c = 5 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 28 - c; + } + + sp_3072_mont_sqr_56(rt, rt, m, mp); + sp_3072_mont_sqr_56(rt, rt, m, mp); + sp_3072_mont_sqr_56(rt, rt, m, mp); + sp_3072_mont_sqr_56(rt, rt, m, mp); + sp_3072_mont_sqr_56(rt, rt, m, mp); + + sp_3072_mont_mul_56(rt, rt, t[y], m, mp); + } + 
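
The final branch of sp_3072_mod_exp_56 around here, compiled when cache-access hardening is not required (WC_NO_CACHE_RESISTANT), is a fixed 5-bit window exponentiation: t[1..31] hold a^1..a^31 in Montgomery form and each window of the exponent costs five squarings plus one table multiply. The commit message's note about mod_exp window sizes refers to this table: 32 entries of 112 digits here, against 16 entries of 224 digits in the 3072-bit version further down. Below is a toy sketch of the same control flow over plain 64-bit modular arithmetic, an illustration only (the real code works on Montgomery-form multi-precision values and pads the exponent length up to a multiple of 5 bits).

    #include <stdint.h>

    /* Fixed 5-bit-window modular exponentiation.  Assumes 1 < m < 2^32 so
     * the 64-bit products cannot overflow. */
    static uint64_t mod_exp_win5(uint64_t a, uint64_t e, uint64_t m)
    {
        uint64_t t[32];
        uint64_t r;
        int i;
        int top;

        a %= m;
        t[0] = 1 % m;
        for (i = 1; i < 32; i++)
            t[i] = (t[i - 1] * a) % m;        /* a^0 .. a^31 */

        /* highest non-zero 5-bit window of e */
        for (top = 60; top > 0 && ((e >> top) & 0x1f) == 0; top -= 5)
            ;
        r = t[(e >> top) & 0x1f];
        for (i = top - 5; i >= 0; i -= 5) {
            int j;
            for (j = 0; j < 5; j++)
                r = (r * r) % m;              /* five squarings ...          */
            r = (r * t[(e >> i) & 0x1f]) % m; /* ... then one table multiply */
        }
        return r;
    }

A wider window means fewer multiplies per exponent bit but a larger precomputed table, which is the memory/performance trade the commit adjusts.
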
+ sp_3072_mont_reduce_56(rt, m, mp); + n = sp_3072_cmp_56(rt, m); + sp_3072_cond_sub_56(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 112); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 3072 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_3072_mont_norm_112(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i = 0; i < 104; i += 8) { + r[i + 0] = 0xfffffff; + r[i + 1] = 0xfffffff; + r[i + 2] = 0xfffffff; + r[i + 3] = 0xfffffff; + r[i + 4] = 0xfffffff; + r[i + 5] = 0xfffffff; + r[i + 6] = 0xfffffff; + r[i + 7] = 0xfffffff; + } + r[104] = 0xfffffff; + r[105] = 0xfffffff; + r[106] = 0xfffffff; + r[107] = 0xfffffff; + r[108] = 0xfffffff; + r[109] = 0xfffffL; + r[110] = 0; + r[111] = 0; + + /* r = (2^n - 1) mod n */ + (void)sp_3072_sub_112(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_3072_cmp_112(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i = 104; i >= 0; i -= 8) { + r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_3072_cond_sub_112(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 112; i += 8) { + r[i + 0] = a[i + 0] - (b[i + 0] & m); + r[i + 1] = a[i + 1] - (b[i + 1] & m); + r[i + 2] = a[i + 2] - (b[i + 2] & m); + r[i + 3] = a[i + 3] - (b[i + 3] & m); + r[i + 4] = a[i + 4] - (b[i + 4] & m); + r[i + 5] = a[i + 5] - (b[i + 5] & m); + r[i + 6] = a[i + 6] - (b[i + 6] & m); + r[i + 7] = a[i + 7] - (b[i + 7] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. 
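
sp_3072_cmp_112 above compares without branching on the data and without an early exit: it scans from the most significant digit down and lets only the first non-zero difference through the mask, so the run time does not reveal where two values differ. A minimal version of the same idea follows (no unrolling, illustrative name); like the original it relies on the compiler lowering r == 0 to a flag rather than a branch.

    #include <stdint.h>

    /* Constant-time compare of two n-digit numbers held in signed digit
     * containers (as sp_digit is in sp_c32.c): negative, zero or positive
     * like memcmp, same work regardless of where the values differ. */
    static int32_t ct_cmp(const int32_t* a, const int32_t* b, int n)
    {
        int32_t r = 0;
        int i;

        for (i = n - 1; i >= 0; i--) {
            /* keep is all-ones only while r is still zero, so later,
             * less significant differences cannot change the result */
            int32_t keep = 0 - (int32_t)(r == 0);
            r |= (a[i] - b[i]) & keep;
        }
        return r;
    }

The mod_exp and RSA routines below feed the sign into cond_sub as ((n < 0) ? 1 : 0) - 1, i.e. an all-ones mask exactly when the Montgomery result is still >= m.
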
+ */ +SP_NOINLINE static void sp_3072_mul_add_112(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 112; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0xfffffff; + t >>= 28; + } + r[112] += (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 104; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0xfffffff; + t[1] += t[0] >> 28; + r[i+1] = t[1] & 0xfffffff; + t[2] += t[1] >> 28; + r[i+2] = t[2] & 0xfffffff; + t[3] += t[2] >> 28; + r[i+3] = t[3] & 0xfffffff; + t[4] += t[3] >> 28; + r[i+4] = t[4] & 0xfffffff; + t[5] += t[4] >> 28; + r[i+5] = t[5] & 0xfffffff; + t[6] += t[5] >> 28; + r[i+6] = t[6] & 0xfffffff; + t[7] += t[6] >> 28; + r[i+7] = t[7] & 0xfffffff; + t[0] = t[7] >> 28; + } + t[0] += (tb * a[104]) + r[104]; + t[1] = (tb * a[105]) + r[105]; + t[2] = (tb * a[106]) + r[106]; + t[3] = (tb * a[107]) + r[107]; + t[4] = (tb * a[108]) + r[108]; + t[5] = (tb * a[109]) + r[109]; + t[6] = (tb * a[110]) + r[110]; + t[7] = (tb * a[111]) + r[111]; + r[104] = t[0] & 0xfffffff; + t[1] += t[0] >> 28; + r[105] = t[1] & 0xfffffff; + t[2] += t[1] >> 28; + r[106] = t[2] & 0xfffffff; + t[3] += t[2] >> 28; + r[107] = t[3] & 0xfffffff; + t[4] += t[3] >> 28; + r[108] = t[4] & 0xfffffff; + t[5] += t[4] >> 28; + r[109] = t[5] & 0xfffffff; + t[6] += t[5] >> 28; + r[110] = t[6] & 0xfffffff; + t[7] += t[6] >> 28; + r[111] = t[7] & 0xfffffff; + r[112] += (sp_digit)(t[7] >> 28); +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Shift the result in the high 3072 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_3072_mont_shift_112(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int64 n = a[109] >> 20; + n += ((sp_int64)a[110]) << 8; + for (i = 0; i < 104; i += 8) { + r[i + 0] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 111]) << 8; + r[i + 1] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 112]) << 8; + r[i + 2] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 113]) << 8; + r[i + 3] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 114]) << 8; + r[i + 4] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 115]) << 8; + r[i + 5] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 116]) << 8; + r[i + 6] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 117]) << 8; + r[i + 7] = n & 0xfffffff; + n >>= 28; n += ((sp_int64)a[i + 118]) << 8; + } + r[104] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[215]) << 8; + r[105] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[216]) << 8; + r[106] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[217]) << 8; + r[107] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[218]) << 8; + r[108] = n & 0xfffffff; n >>= 28; n += ((sp_int64)a[219]) << 8; + r[109] = (sp_digit)n; + XMEMSET(&r[110], 0, sizeof(*r) * 110U); +} + +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
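
sp_3072_mont_shift_112 above drops the low 3072 bits of the reduced value. Since 3072 = 109 * 28 + 20, the cut falls 20 bits into digit 109, which is why the accumulator starts from a[109] >> 20 and pulls the remaining 8 bits of each following digit (the 1536-bit variant earlier uses >> 24 and << 4 for the same reason). Below is a generic, un-unrolled sketch of shifting a digit array right by an arbitrary bit count; the names and the variable-shift form are illustrative, the patch keeps the fully unrolled constant-shift version.

    #include <stdint.h>

    #define DIGIT_BITS 28
    #define DIGIT_MASK 0xfffffffu

    /* r = a >> bits, a and r holding n digits of DIGIT_BITS each. */
    static void rshift_bits(uint32_t* r, const uint32_t* a, int n, int bits)
    {
        int skip = bits / DIGIT_BITS;       /* whole digits to drop */
        int off  = bits % DIGIT_BITS;       /* remaining bit offset */
        int i;

        for (i = 0; i + skip < n; i++) {
            uint32_t lo = a[i + skip] >> off;
            uint32_t hi = (i + skip + 1 < n) ? a[i + skip + 1] : 0;
            r[i] = lo | ((hi << (DIGIT_BITS - off)) & DIGIT_MASK);
        }
        for ( ; i < n; i++)
            r[i] = 0;
    }
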
+ */ +static void sp_3072_mont_reduce_112(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_3072_norm_112(a + 110); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<109; i++) { + mu = (a[i] * mp) & 0xfffffff; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + } + mu = (a[i] * mp) & 0xfffffL; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + a[i] &= 0xfffffff; + } + else { + for (i=0; i<109; i++) { + mu = a[i] & 0xfffffff; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + } + mu = a[i] & 0xfffffL; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + a[i] &= 0xfffffff; + } +#else + for (i=0; i<109; i++) { + mu = (a[i] * mp) & 0xfffffff; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + } + mu = (a[i] * mp) & 0xfffffL; + sp_3072_mul_add_112(a+i, m, mu); + a[i+1] += a[i] >> 28; + a[i] &= 0xfffffff; +#endif + sp_3072_mont_shift_112(a, a); + sp_3072_cond_sub_112(a, a, m, 0 - (((a[109] - m[109]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_3072_norm_112(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_mul_112(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_3072_mul_112(r, a, b); + sp_3072_mont_reduce_112(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_sqr_112(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_3072_sqr_112(r, a); + sp_3072_mont_reduce_112(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_224(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 224; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0xfffffff); + t >>= 28; + r[i + 3] = (sp_digit)t2; + } + r[224] = (sp_digit)(t & 0xfffffff); +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
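
The WOLFSSL_SP_DH branch of sp_3072_mont_reduce_112 above skips the multiply by mp when mp == 1. That is the FFDHE case: the RFC 7919 primes end in a long run of one bits, so m[0] is 0xfffffff, its negated inverse mod 2^28 is 1, and mu is simply a[i]. RSA moduli get no such guarantee, so the generic path keeps the multiply. The sketch below shows how such a constant is derived by Hensel lifting; it is only a stand-in for sp_3072_mont_setup, which is defined elsewhere in the file and not shown in this hunk.

    #include <stdint.h>
    #include <assert.h>

    #define DIGIT_BITS 28
    #define DIGIT_MASK 0xfffffffu

    /* mp = -m0^(-1) mod 2^DIGIT_BITS for an odd low digit m0.  Each lifting
     * step doubles the number of correct low bits of the inverse. */
    static uint32_t mont_setup(uint32_t m0)
    {
        uint32_t x = m0;             /* inverse mod 2^3 for any odd m0 */
        x *= 2 - m0 * x;             /* mod 2^6  */
        x *= 2 - m0 * x;             /* mod 2^12 */
        x *= 2 - m0 * x;             /* mod 2^24 */
        x *= 2 - m0 * x;             /* mod 2^32 (capped) */
        return (0 - x) & DIGIT_MASK; /* negate to get -m0^(-1) */
    }

    int main(void)
    {
        /* FFDHE-style low digit: all ones, so mp == 1 and the multiply by
         * mp inside the reduction loop is redundant. */
        assert(mont_setup(0xfffffffu) == 1u);
        /* arbitrary odd digit: m0 * (-mp) must be 1 mod 2^28 */
        assert(((0x89abcd1u * (0u - mont_setup(0x89abcd1u))) & DIGIT_MASK) == 1u);
        return 0;
    }
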
+ */ +static void sp_3072_cond_add_112(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 104; i += 8) { + r[i + 0] = a[i + 0] + (b[i + 0] & m); + r[i + 1] = a[i + 1] + (b[i + 1] & m); + r[i + 2] = a[i + 2] + (b[i + 2] & m); + r[i + 3] = a[i + 3] + (b[i + 3] & m); + r[i + 4] = a[i + 4] + (b[i + 4] & m); + r[i + 5] = a[i + 5] + (b[i + 5] & m); + r[i + 6] = a[i + 6] + (b[i + 6] & m); + r[i + 7] = a[i + 7] + (b[i + 7] & m); + } + r[104] = a[104] + (b[104] & m); + r[105] = a[105] + (b[105] & m); + r[106] = a[106] + (b[106] & m); + r[107] = a[107] + (b[107] & m); + r[108] = a[108] + (b[108] & m); + r[109] = a[109] + (b[109] & m); +} + +SP_NOINLINE static void sp_3072_rshift_112(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<104; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (28 - n)) & 0xfffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (28 - n)) & 0xfffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (28 - n)) & 0xfffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (28 - n)) & 0xfffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (28 - n)) & 0xfffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (28 - n)) & 0xfffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (28 - n)) & 0xfffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (28 - n)) & 0xfffffff); + } + r[104] = (a[104] >> n) | ((a[105] << (28 - n)) & 0xfffffff); + r[105] = (a[105] >> n) | ((a[106] << (28 - n)) & 0xfffffff); + r[106] = (a[106] >> n) | ((a[107] << (28 - n)) & 0xfffffff); + r[107] = (a[107] >> n) | ((a[108] << (28 - n)) & 0xfffffff); + r[108] = (a[108] >> n) | ((a[109] << (28 - n)) & 0xfffffff); + r[109] = (a[109] >> n) | ((a[110] << (28 - n)) & 0xfffffff); + r[110] = (a[110] >> n) | ((a[111] << (28 - n)) & 0xfffffff); + r[111] = a[111] >> n; +} + +#ifdef WOLFSSL_SP_DIV_32 +static WC_INLINE sp_digit sp_3072_div_word_112(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 28 bits from d1 and top 3 bits from d0. */ + d = (d1 << 3) + (d0 >> 25); + r = d / dv; + d -= r * dv; + /* Up to 4 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 22) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 19) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 10 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 16) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 13) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 16 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 10) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 7) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 22 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 4) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 1) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 28 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 28 bits from d1 and top 3 bits from d0. 
*/ + return r; +} +#endif /* WOLFSSL_SP_DIV_32 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_div_112(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_32 + sp_int64 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 112 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 112 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 224 + 1; + sd = t2 + 112 + 1; + + sp_3072_mul_d_112(sd, d, (sp_digit)1 << 8); + sp_3072_mul_d_224(t1, a, (sp_digit)1 << 8); + dv = sd[109]; + t1[110 + 110] += t1[110 + 110 - 1] >> 28; + t1[110 + 110 - 1] &= 0xfffffff; + for (i=110; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_32 + d1 = t1[110 + i]; + d1 <<= 28; + d1 += t1[110 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_112(t1[110 + i], t1[110 + i - 1], dv); +#endif + + sp_3072_mul_d_112(t2, sd, r1); + (void)sp_3072_sub_112(&t1[i], &t1[i], t2); + sp_3072_norm_110(&t1[i]); + t1[110 + i] += t1[110 + i - 1] >> 28; + t1[110 + i - 1] &= 0xfffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[110 + i]; + d1 <<= 28; + d1 -= t1[110 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_112(-t1[110 + i], -t1[110 + i - 1], dv); +#endif + r1 -= t1[110 + i]; + sp_3072_mul_d_112(t2, sd, r1); + (void)sp_3072_add_112(&t1[i], &t1[i], t2); + t1[110 + i] += t1[110 + i - 1] >> 28; + t1[110 + i - 1] &= 0xfffffff; + } + t1[110 - 1] += t1[110 - 2] >> 28; + t1[110 - 2] &= 0xfffffff; + r1 = t1[110 - 1] / dv; + + sp_3072_mul_d_112(t2, sd, r1); + sp_3072_sub_112(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 224U); + for (i=0; i<109; i++) { + r[i+1] += r[i] >> 28; + r[i] &= 0xfffffff; + } + sp_3072_cond_add_112(r, r, sd, 0 - ((r[109] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_110(r); + sp_3072_rshift_112(r, r, 8); + r[110] = 0; + r[111] = 0; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_mod_112(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_112(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. 
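
sp_3072_div_112 above follows the schoolbook pattern: scale both operands by 2^8 so the divisor's top digit dv has its high bits set, estimate each quotient digit from the top two dividend digits divided by dv alone, multiply-and-subtract, run a second estimate on the (possibly negative) remainder to correct it, add the divisor back once if the result went negative, and finally shift the remainder back down by 8. The base-256 toy below shows why a normalized top digit keeps the single-digit estimate within a couple of the true quotient digit; the base and all names are illustrative only.

    #include <stdint.h>
    #include <stdio.h>

    #define B 256u   /* toy digit base standing in for 2^28 */

    /* One quotient digit of (u2 u1 u0) / (v1 v0), estimated from the top
     * digits only and then corrected.  Requires v1 >= B/2 (the divisor was
     * shifted up, as the "<< 8" scaling does) and a true quotient < B. */
    static uint32_t quot_digit(uint32_t u2, uint32_t u1, uint32_t u0,
                               uint32_t v1, uint32_t v0)
    {
        uint32_t qhat = (u2 * B + u1) / v1;      /* estimate, top digits only */
        uint32_t u = (u2 * B + u1) * B + u0;
        uint32_t v = v1 * B + v0;

        if (qhat > B - 1)
            qhat = B - 1;
        while (qhat * v > u)   /* with v1 >= B/2 this loops at most twice */
            qhat--;
        return qhat;
    }

    int main(void)
    {
        /* 0x400000 / 0x80FF: the estimate 128 is one too large and is
         * corrected down to the true quotient digit 127. */
        printf("%u\n", quot_digit(0x40, 0x00, 0x00, 0x80, 0xFF));
        return 0;
    }
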
+ * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_112(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 224]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 112 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 112 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 112U * 2U); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_112(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 112U); + } + } + if (err == MP_OKAY) { + sp_3072_mul_112(t[1], t[1], norm); + err = sp_3072_mod_112(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 28; + c = bits % 28; + n = e[i--] << (28 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 28; + } + + y = (int)((n >> 27) & 1); + n <<= 1; + + sp_3072_mont_mul_112(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 112 * 2); + sp_3072_mont_sqr_112(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 112 * 2); + } + + sp_3072_mont_reduce_112(t[0], m, mp); + n = sp_3072_cmp_112(t[0], m); + sp_3072_cond_sub_112(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 112 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 224]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 112 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 112 * 2); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_112(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_112(t[1], t[1], norm); + err = sp_3072_mod_112(t[1], t[1], m); + } + } + else { + sp_3072_mul_112(t[1], a, norm); + err = sp_3072_mod_112(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 28; + c = bits % 28; + n = e[i--] << (28 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 28; + } + + y = (int)((n >> 27) & 1); + n <<= 1; + + sp_3072_mont_mul_112(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 112 * 2); + sp_3072_mont_sqr_112(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 112 * 2); + } + + sp_3072_mont_reduce_112(t[0], m, mp); + n = sp_3072_cmp_112(t[0], m); + sp_3072_cond_sub_112(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 112 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 224) + 224]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 224) + 224), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 224; + rt = td + 3584; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_112(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_112(t[1], t[1], norm); + err = sp_3072_mod_112(t[1], t[1], m); + } + } + else { + sp_3072_mul_112(t[1], a, norm); + err = sp_3072_mod_112(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_112(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_112(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_112(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_112(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_112(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_112(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_112(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_112(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_112(t[10], t[ 5], m, mp); + sp_3072_mont_mul_112(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_112(t[12], t[ 6], m, mp); + sp_3072_mont_mul_112(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_112(t[14], t[ 7], m, mp); + sp_3072_mont_mul_112(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 27) / 28) - 1; + c = bits % 28; + if (c == 0) { + c = 28; + } + if (i < 112) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (4 - c); + c += 28; + } + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 224); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 24; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 4; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 28 - c; + } + + sp_3072_mont_sqr_112(rt, rt, m, mp); + sp_3072_mont_sqr_112(rt, rt, m, mp); + sp_3072_mont_sqr_112(rt, rt, m, mp); + sp_3072_mont_sqr_112(rt, rt, m, mp); + + sp_3072_mont_mul_112(rt, rt, t[y], m, mp); + } + + sp_3072_mont_reduce_112(rt, m, mp); + n = sp_3072_cmp_112(rt, m); + sp_3072_cond_sub_112(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 224); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ + /* WOLFSSL_HAVE_SP_DH */ + +#ifdef WOLFSSL_HAVE_SP_RSA +/* RSA public key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * em Public exponent. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. 
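
In the two cache-resistant branches of the mod_exp functions above, the ladder squares whichever of t[0] and t[1] the current exponent bit selects, but the selection is done by masking the two candidate pointers with addr_mask[y] and addr_mask[y ^ 1] and adding them rather than by loading t[y] directly, so both buffers are referenced on every iteration (addr_mask itself, a two-entry 0/all-ones table, is declared elsewhere in the file and is not part of this hunk). Below is a standalone sketch of that masked pointer select, with a local addr_mask that is assumed to match the original's shape.

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* 0 -> all-zero mask, 1 -> all-ones mask. */
    static const size_t addr_mask[2] = { 0, (size_t)-1 };

    /* Return p0 when bit == 0 and p1 when bit == 1; the secret bit only
     * indexes the two-entry mask table, never the large operand buffers. */
    static const uint32_t* ct_select(const uint32_t* p0, const uint32_t* p1,
                                     unsigned bit)
    {
        return (const uint32_t*)(((size_t)p0 & addr_mask[bit ^ 1]) +
                                 ((size_t)p1 & addr_mask[bit]));
    }

    int main(void)
    {
        uint32_t a = 1, b = 2;
        printf("%u %u\n", *ct_select(&a, &b, 0), *ct_select(&a, &b, 1)); /* 1 2 */
        return 0;
    }
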
+ * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. + */ +int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, + const mp_int* mm, byte* out, word32* outLen) +{ +#ifdef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[112 * 5]; +#endif + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit* norm = NULL; + sp_digit e[1] = {0}; + sp_digit mp; + int i; + int err = MP_OKAY; + + if (*outLen < 384U) { + err = MP_TO_E; + } + + if (err == MP_OKAY) { + if (mp_count_bits(em) > 28) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + r = a + 112 * 2; + m = r + 112 * 2; + norm = r; + + sp_3072_from_bin(a, 112, in, inLen); +#if DIGIT_BIT >= 28 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + + if (err == MP_OKAY) { + sp_3072_from_mp(m, 112, mm); + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + } + if (err == MP_OKAY) { + sp_3072_mul_112(a, a, norm); + err = sp_3072_mod_112(a, a, m); + } + if (err == MP_OKAY) { + for (i=27; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 112 * 2); + for (i--; i>=0; i--) { + sp_3072_mont_sqr_112(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_3072_mont_mul_112(r, r, a, m, mp); + } + } + sp_3072_mont_reduce_112(r, m, mp); + mp = sp_3072_cmp_112(r, m); + sp_3072_cond_sub_112(r, r, m, ((mp < 0) ? 
+ (sp_digit)1 : (sp_digit)0)- 1); + + sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[112 * 5]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit e[1] = {0}; + int err = MP_OKAY; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(em) > 28) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d; + r = a + 112 * 2; + m = r + 112 * 2; + + sp_3072_from_bin(a, 112, in, inLen); +#if DIGIT_BIT >= 28 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + if (err == MP_OKAY) { + sp_3072_from_mp(m, 112, mm); + + if (e[0] == 0x3) { + sp_3072_sqr_112(r, a); + err = sp_3072_mod_112(r, r, m); + if (err == MP_OKAY) { + sp_3072_mul_112(r, a, r); + err = sp_3072_mod_112(r, r, m); + } + } + else { + sp_digit* norm = r; + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + + sp_3072_mul_112(a, a, norm); + err = sp_3072_mod_112(a, a, m); + + if (err == MP_OKAY) { + for (i=27; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 224U); + for (i--; i>=0; i--) { + sp_3072_mont_sqr_112(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_3072_mont_mul_112(r, r, a, m, mp); + } + } + sp_3072_mont_reduce_112(r, m, mp); + mp = sp_3072_cmp_112(r, m); + sp_3072_cond_sub_112(r, r, m, ((mp < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + } + } + + if (err == MP_OKAY) { + sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#endif /* WOLFSSL_SP_SMALL */ +} + +#ifndef WOLFSSL_RSA_PUBLIC_ONLY +#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM) +#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */ +/* RSA private key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * dm Private exponent. + * pm First prime. + * qm Second prime. + * dpm First prime's CRT exponent. + * dqm Second prime's CRT exponent. + * qim Inverse of second prime mod p. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. + * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. 
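
sp_RsaPublic_3072 above keeps the public exponent in a single 28-bit digit (mp_count_bits(em) > 28 is rejected); the non-small build special-cases e == 3 as one squaring plus one multiply, and both builds otherwise run plain left-to-right square-and-multiply from the highest set bit of e, all on Montgomery-form values. A toy 64-bit sketch of that loop shape, an illustration only (assumes e >= 1 and m < 2^32 so the products fit):

    #include <stdint.h>

    /* Left-to-right square-and-multiply for a small public exponent. */
    static uint64_t pow_small_e(uint64_t a, uint32_t e, uint64_t m)
    {
        uint64_t r;
        int i;

        a %= m;
        for (i = 31; i >= 0; i--)      /* find the top set bit of e (e >= 1) */
            if ((e >> i) & 1)
                break;
        r = a;                          /* top bit consumed */
        for (i--; i >= 0; i--) {
            r = (r * r) % m;            /* always square ...          */
            if ((e >> i) & 1)
                r = (r * a) % m;        /* ... multiply on a set bit  */
        }
        return r;
    }

For e = 65537 this is sixteen squarings and one multiply, which is why the public operation needs no precomputed table.
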
+ */ +int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, + const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm, + const mp_int* qim, const mp_int* mm, byte* out, word32* outLen) +{ +#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[112 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 3072) { + err = MP_READ_E; + } + else if (inLen > 384) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 112; + m = a + 224; + r = a; + + sp_3072_from_bin(a, 112, in, inLen); + sp_3072_from_mp(d, 112, dm); + sp_3072_from_mp(m, 112, mm); + err = sp_3072_mod_exp_112(r, a, d, 3072, m, 0); + } + + if (err == MP_OKAY) { + sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 112); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[112 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 3072) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 112; + m = a + 224; + r = a; + + sp_3072_from_bin(a, 112, in, inLen); + sp_3072_from_mp(d, 112, dm); + sp_3072_from_mp(m, 112, mm); + err = sp_3072_mod_exp_112(r, a, d, 3072, m, 0); + } + + if (err == MP_OKAY) { + sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 112); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#else +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[56 * 8]; +#endif + sp_digit* p = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = 
NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 384) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 8, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + if (err == MP_OKAY) { + p = a + 112; + qi = dq = dp = p + 56; + tmpa = qi + 56; + tmpb = tmpa + 112; + r = a; + + sp_3072_from_bin(a, 112, in, inLen); + sp_3072_from_mp(p, 56, pm); + sp_3072_from_mp(dp, 56, dpm); + err = sp_3072_mod_exp_56(tmpa, a, dp, 1536, p, 1); + } + if (err == MP_OKAY) { + sp_3072_from_mp(p, 56, qm); + sp_3072_from_mp(dq, 56, dqm); + err = sp_3072_mod_exp_56(tmpb, a, dq, 1536, p, 1); + } + if (err == MP_OKAY) { + sp_3072_from_mp(p, 56, pm); + (void)sp_3072_sub_56(tmpa, tmpa, tmpb); + sp_3072_norm_55(tmpa); + sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31)); + sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31)); + + sp_3072_from_mp(qi, 56, qim); + sp_3072_mul_56(tmpa, tmpa, qi); + err = sp_3072_mod_56(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_3072_from_mp(p, 56, qm); + sp_3072_mul_56(tmpa, p, tmpa); + (void)sp_3072_add_112(r, tmpb, tmpa); + sp_3072_norm_112(r); + + sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 56 * 8); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[56 * 13]; +#endif + sp_digit* p = NULL; + sp_digit* q = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 13, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = a + 112 * 2; + q = p + 56; + dp = q + 56; + dq = dp + 56; + qi = dq + 56; + tmpa = qi + 56; + tmpb = tmpa + 112; + r = a; + + sp_3072_from_bin(a, 112, in, inLen); + sp_3072_from_mp(p, 56, pm); + sp_3072_from_mp(q, 56, qm); + sp_3072_from_mp(dp, 56, dpm); + sp_3072_from_mp(dq, 56, dqm); + sp_3072_from_mp(qi, 56, qim); + + err = sp_3072_mod_exp_56(tmpa, a, dp, 1536, p, 1); + } + if (err == MP_OKAY) { + err = sp_3072_mod_exp_56(tmpb, a, dq, 1536, q, 1); + } + + if (err == MP_OKAY) { + (void)sp_3072_sub_56(tmpa, tmpa, tmpb); + sp_3072_norm_55(tmpa); + sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31)); + sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31)); + sp_3072_mul_56(tmpa, tmpa, qi); + err = sp_3072_mod_56(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_3072_mul_56(tmpa, tmpa, q); + (void)sp_3072_add_112(r, tmpb, tmpa); + sp_3072_norm_112(r); + + 
sp_3072_to_bin_112(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) +if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 56 * 13); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); + #endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */ +} + +#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */ +#endif /* WOLFSSL_HAVE_SP_RSA */ +#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_3072_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 28 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 112); + r->used = 112; + mp_clamp(r); +#elif DIGIT_BIT < 28 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 112; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 28) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 28 - s; + } + r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 112; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 28 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 28 - s; + } + else { + s += 28; + } + } + r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
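
Without SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM, sp_RsaPrivate_3072 above takes the CRT route: two half-size exponentiations tmpa = in^dP mod p and tmpb = in^dQ mod q, then the Garner recombination tmpa = qInv * (tmpa - tmpb) mod p (two conditional adds of p keep the difference non-negative) and r = tmpb + q * tmpa. Below is a toy sketch with numbers small enough to check by hand; the helpers and the 64-bit types are illustrative, not the patch's API.

    #include <stdint.h>
    #include <stdio.h>

    /* x^e mod m by square-and-multiply; toy helper for the CRT sketch. */
    static uint64_t pow_mod(uint64_t x, uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        x %= m;
        while (e != 0) {
            if (e & 1)
                r = (r * x) % m;
            x = (x * x) % m;
            e >>= 1;
        }
        return r;
    }

    /* RSA private operation via CRT (Garner recombination). */
    static uint64_t rsa_crt(uint64_t c, uint64_t p, uint64_t q,
                            uint64_t dp, uint64_t dq, uint64_t qinv)
    {
        uint64_t m1 = pow_mod(c, dp, p);        /* c^dP mod p         */
        uint64_t m2 = pow_mod(c, dq, q);        /* c^dQ mod q         */
        uint64_t h  = (m1 + p - m2 % p) % p;    /* m1 - m2, kept >= 0 */
        h = (h * qinv) % p;                     /* * qInv mod p       */
        return m2 + h * q;                      /* recombine          */
    }

    int main(void)
    {
        /* p = 11, q = 13, n = 143, e = 7, d = 103, dP = 3, dQ = 7,
         * qInv = 13^-1 mod 11 = 6; message 42 round-trips. */
        uint64_t c = pow_mod(42, 7, 143);
        printf("%llu\n", (unsigned long long)rsa_crt(c, 11, 13, 3, 7, 6)); /* 42 */
        return 0;
    }
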
+ */ +int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[112 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expBits > 3072) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 112 * 2; + m = e + 112; + r = b; + + sp_3072_from_mp(b, 112, base); + sp_3072_from_mp(e, 112, exp); + sp_3072_from_mp(m, 112, mod); + + err = sp_3072_mod_exp_112(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + err = sp_3072_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 112U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[112 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expBits > 3072) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 112 * 2; + m = e + 112; + r = b; + + sp_3072_from_mp(b, 112, base); + sp_3072_from_mp(e, 112, exp); + sp_3072_from_mp(m, 112, mod); + + err = sp_3072_mod_exp_112(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + err = sp_3072_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 112U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +SP_NOINLINE static void sp_3072_lshift_112(sp_digit* r, const sp_digit* a, + byte n) +{ + sp_int_digit s; + sp_int_digit t; + + s = (sp_int_digit)a[111]; + r[112] = s >> (28U - n); + s = (sp_int_digit)(a[111]); t = (sp_int_digit)(a[110]); + r[111] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[110]); t = (sp_int_digit)(a[109]); + r[110] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[109]); t = (sp_int_digit)(a[108]); + r[109] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[108]); t = (sp_int_digit)(a[107]); + r[108] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[107]); t = (sp_int_digit)(a[106]); + r[107] = 
((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[106]); t = (sp_int_digit)(a[105]); + r[106] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[105]); t = (sp_int_digit)(a[104]); + r[105] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[104]); t = (sp_int_digit)(a[103]); + r[104] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[103]); t = (sp_int_digit)(a[102]); + r[103] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[102]); t = (sp_int_digit)(a[101]); + r[102] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[101]); t = (sp_int_digit)(a[100]); + r[101] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[100]); t = (sp_int_digit)(a[99]); + r[100] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[99]); t = (sp_int_digit)(a[98]); + r[99] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[98]); t = (sp_int_digit)(a[97]); + r[98] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[97]); t = (sp_int_digit)(a[96]); + r[97] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[96]); t = (sp_int_digit)(a[95]); + r[96] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[95]); t = (sp_int_digit)(a[94]); + r[95] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[94]); t = (sp_int_digit)(a[93]); + r[94] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[93]); t = (sp_int_digit)(a[92]); + r[93] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[92]); t = (sp_int_digit)(a[91]); + r[92] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[91]); t = (sp_int_digit)(a[90]); + r[91] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[90]); t = (sp_int_digit)(a[89]); + r[90] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[89]); t = (sp_int_digit)(a[88]); + r[89] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[88]); t = (sp_int_digit)(a[87]); + r[88] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[87]); t = (sp_int_digit)(a[86]); + r[87] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[86]); t = (sp_int_digit)(a[85]); + r[86] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[85]); t = (sp_int_digit)(a[84]); + r[85] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[84]); t = (sp_int_digit)(a[83]); + r[84] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[83]); t = (sp_int_digit)(a[82]); + r[83] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[82]); t = (sp_int_digit)(a[81]); + r[82] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[81]); t = (sp_int_digit)(a[80]); + r[81] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[80]); t = (sp_int_digit)(a[79]); + r[80] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[79]); t = (sp_int_digit)(a[78]); + r[79] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[78]); t = (sp_int_digit)(a[77]); + r[78] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[77]); t = (sp_int_digit)(a[76]); + r[77] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[76]); t = (sp_int_digit)(a[75]); + r[76] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[75]); t = (sp_int_digit)(a[74]); + r[75] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s 
= (sp_int_digit)(a[74]); t = (sp_int_digit)(a[73]); + r[74] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[73]); t = (sp_int_digit)(a[72]); + r[73] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[72]); t = (sp_int_digit)(a[71]); + r[72] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]); + r[71] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]); + r[70] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]); + r[69] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]); + r[68] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]); + r[67] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]); + r[66] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]); + r[65] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]); + r[64] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]); + r[63] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]); + r[62] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]); + r[61] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]); + r[60] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]); + r[59] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]); + r[58] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]); + r[57] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]); + r[56] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]); + r[55] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]); + r[54] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]); + r[53] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]); + r[52] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]); + r[51] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]); + r[50] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]); + r[49] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]); + r[48] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]); + r[47] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]); + r[46] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]); + r[45] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]); + r[44] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]); + r[43] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]); + r[42] = ((s 
<< n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]); + r[41] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]); + r[40] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]); + r[39] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[38]); t = (sp_int_digit)(a[37]); + r[38] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]); + r[37] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]); + r[36] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[35]); t = (sp_int_digit)(a[34]); + r[35] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]); + r[34] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]); + r[33] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]); + r[32] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]); + r[31] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]); + r[30] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); + r[29] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]); + r[28] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); + r[27] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); + r[26] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); + r[25] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); + r[24] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); + r[23] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); + r[22] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); + r[21] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); + r[20] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); + r[19] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); + r[18] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); + r[17] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); + r[16] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); + r[15] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); + r[14] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); + r[13] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); + r[12] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); + r[11] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); + r[10] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[9]); 
t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (28U - n))) & 0xfffffff; + r[0] = (a[0] << n) & 0xfffffff; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_2_112(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[337]; +#endif + sp_digit* norm = NULL; + sp_digit* tmp = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit o; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 337, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + tmp = td + 224; + XMEMSET(td, 0, sizeof(sp_digit) * 337); + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_112(norm, m); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 27) / 28) - 1; + c = bits % 28; + if (c == 0) { + c = 28; + } + if (i < 112) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (4 - c); + c += 28; + } + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + sp_3072_lshift_112(r, norm, (byte)y); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 24; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 4; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 28 - c; + } + + sp_3072_mont_sqr_112(r, r, m, mp); + sp_3072_mont_sqr_112(r, r, m, mp); + sp_3072_mont_sqr_112(r, r, m, mp); + sp_3072_mont_sqr_112(r, r, m, mp); + + sp_3072_lshift_112(r, r, (byte)y); + sp_3072_mul_d_112(tmp, norm, (r[110] << 8) + (r[109] >> 20)); + r[110] = 0; + r[109] &= 0xfffffL; + (void)sp_3072_add_112(r, r, tmp); + sp_3072_norm_112(r); + o = sp_3072_cmp_112(r, m); + sp_3072_cond_sub_112(r, r, m, ((o < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + + sp_3072_mont_reduce_112(r, m, mp); + n = sp_3072_cmp_112(r, m); + sp_3072_cond_sub_112(r, r, m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_3072 */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. + * exp Array of bytes that is the exponent. + * expLen Length of data, in bytes, in exponent. + * mod Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. + * outLen Length, in bytes, of exponentiation result. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. + */ +int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, + const mp_int* mod, byte* out, word32* outLen) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[112 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + word32 i; + int err = MP_OKAY; + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 112 * 2; + m = e + 112; + r = b; + + sp_3072_from_mp(b, 112, base); + sp_3072_from_bin(e, 112, exp, expLen); + sp_3072_from_mp(m, 112, mod); + + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2U && + (m[109] >> 4) == 0xffffL) { + err = sp_3072_mod_exp_2_112(r, e, expLen * 8U, m); + } + else { + #endif + err = sp_3072_mod_exp_112(r, b, e, expLen * 8U, m, 0); + #ifdef HAVE_FFDHE_3072 + } + #endif + } + + if (err == MP_OKAY) { + sp_3072_to_bin_112(r, out); + *outLen = 384; + for (i=0; i<384U && out[i] == 0U; i++) { + /* Search for first non-zero. */ + } + *outLen -= i; + XMEMMOVE(out, out + i, *outLen); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 112U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +} +#endif /* WOLFSSL_HAVE_SP_DH */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
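+ *
+ * (Illustrative note: the function computes res = base^exp mod mod for a
+ *  1536-bit odd modulus; inputs wider than 1536 bits are rejected with
+ *  MP_READ_E and an even modulus with MP_VAL, matching the checks at the
+ *  top of the function body.)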
+ */ +int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[56 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1536) { + err = MP_READ_E; + } + else if (expBits > 1536) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1536) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 56 * 2; + m = e + 56; + r = b; + + sp_3072_from_mp(b, 56, base); + sp_3072_from_mp(e, 56, exp); + sp_3072_from_mp(m, 56, mod); + + err = sp_3072_mod_exp_56(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 56, 0, sizeof(*r) * 56U); + err = sp_3072_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 112U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[56 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1536) { + err = MP_READ_E; + } + else if (expBits > 1536) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1536) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 56 * 2; + m = e + 56; + r = b; + + sp_3072_from_mp(b, 56, base); + sp_3072_from_mp(e, 56, exp); + sp_3072_from_mp(m, 56, mod); + + err = sp_3072_mod_exp_56(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 56, 0, sizeof(*r) * 56U); + err = sp_3072_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 112U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ + +#endif /* WOLFSSL_SP_SMALL */ +#endif /* !WOLFSSL_SP_NO_3072 */ + +#ifdef WOLFSSL_SP_4096 +#ifdef WOLFSSL_SP_SMALL +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. 
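+ *
+ * (Illustrative note: a[n-1] is the least significant byte, so after
+ *  packing, limb r[j] holds bits 29*j .. 29*j+28 of the value and any
+ *  remaining limbs up to size are cleared.)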
+ */ +static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = n-1; i >= 0; i--) { + r[j] |= (((sp_digit)a[i]) << s); + if (s >= 21U) { + r[j] &= 0x1fffffff; + s = 29U - s; + if (j + 1 >= size) { + break; + } + r[++j] = (sp_digit)a[i] >> s; + s = 8U - s; + } + else { + s += 8U; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +} + +/* Convert an mp_int to an array of sp_digit. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a A multi-precision integer. + */ +static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) +{ +#if DIGIT_BIT == 29 + int j; + + XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); + + for (j = a->used; j < size; j++) { + r[j] = 0; + } +#elif DIGIT_BIT > 29 + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i] << s); + r[j] &= 0x1fffffff; + s = 29U - s; + if (j + 1 >= size) { + break; + } + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + while ((s + 29U) <= (word32)DIGIT_BIT) { + s += 29U; + r[j] &= 0x1fffffff; + if (j + 1 >= size) { + break; + } + if (s < (word32)DIGIT_BIT) { + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + } + else { + r[++j] = (sp_digit)0; + } + } + s = (word32)DIGIT_BIT - s; + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#else + int i; + int j = 0; + int s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i]) << s; + if (s + DIGIT_BIT >= 29) { + r[j] &= 0x1fffffff; + if (j + 1 >= size) { + break; + } + s = 29 - s; if (s == DIGIT_BIT) { r[++j] = 0; s = 0; @@ -8652,20 +12654,20 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_142(sp_digit* r, byte* a) { int i; int j; int s = 0; int b; - for (i=0; i<195; i++) { - r[i+1] += r[i] >> 21; - r[i] &= 0x1fffff; + for (i=0; i<141; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } j = 4096 / 8 - 1; a[j] = 0; - for (i=0; i<196 && j>=0; i++) { + for (i=0; i<142 && j>=0; i++) { b = 0; /* lint allow cast of mismatch sp_digit and int */ a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ @@ -8673,14 +12675,14 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) if (j < 0) { break; } - while (b < 21) { + while (b < 29) { a[j--] = (byte)(r[i] >> b); b += 8; if (j < 0) { break; } } - s = 8 - (b - 21); + s = 8 - (b - 29); if (j >= 0) { a[j] = 0; } @@ -8690,476 +12692,87 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } -#ifndef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_4096_mul_49(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int64_t t[98]; - - XMEMSET(t, 0, sizeof(t)); - for (i=0; i<49; i++) { - for (j=0; j<49; j++) { - t[i+j] += ((int64_t)a[i]) * b[j]; - } - } - for (i=0; i<97; i++) { - r[i] = t[i] & 0x1fffff; - t[i+1] += t[i] >> 21; - } - r[97] = (sp_digit)t[97]; -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_sqr_49(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int64_t t[98]; - - XMEMSET(t, 0, sizeof(t)); - for (i=0; i<49; i++) { - for (j=0; j> 21; - } - r[97] = (sp_digit)t[97]; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_add_49(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 48; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[48] = a[48] + b[48]; - - return 0; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_add_98(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 96; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[96] = a[96] + b[96]; - r[97] = a[97] + b[97]; - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_sub_98(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 96; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[96] = a[96] - b[96]; - r[97] = a[97] - b[97]; - - return 0; -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_4096_mul_98(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[98]; - sp_digit* a1 = z1; - sp_digit b1[49]; - sp_digit* z2 = r + 98; - (void)sp_4096_add_49(a1, a, &a[49]); - (void)sp_4096_add_49(b1, b, &b[49]); - sp_4096_mul_49(z2, &a[49], &b[49]); - sp_4096_mul_49(z0, a, b); - sp_4096_mul_49(z1, a1, b1); - (void)sp_4096_sub_98(z1, z1, z2); - (void)sp_4096_sub_98(z1, z1, z0); - (void)sp_4096_add_98(r + 49, r + 49, z1); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_98(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z1[98]; - sp_digit* a1 = z1; - sp_digit* z2 = r + 98; - (void)sp_4096_add_49(a1, a, &a[49]); - sp_4096_sqr_49(z2, &a[49]); - sp_4096_sqr_49(z0, a); - sp_4096_sqr_49(z1, a1); - (void)sp_4096_sub_98(z1, z1, z2); - (void)sp_4096_sub_98(z1, z1, z0); - (void)sp_4096_add_98(r + 49, r + 49, z1); -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static int sp_4096_add_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 192; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[192] = a[192] + b[192]; - r[193] = a[193] + b[193]; - r[194] = a[194] + b[194]; - r[195] = a[195] + b[195]; - - return 0; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_sub_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 192; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[192] = a[192] - b[192]; - r[193] = a[193] - b[193]; - r[194] = a[194] - b[194]; - r[195] = a[195] - b[195]; - - return 0; -} - -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_4096_mul_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[196]; - sp_digit* a1 = z1; - sp_digit b1[98]; - sp_digit* z2 = r + 196; - (void)sp_4096_add_98(a1, a, &a[98]); - (void)sp_4096_add_98(b1, b, &b[98]); - sp_4096_mul_98(z2, &a[98], &b[98]); - sp_4096_mul_98(z0, a, b); - sp_4096_mul_98(z1, a1, b1); - (void)sp_4096_sub_196(z1, z1, z2); - (void)sp_4096_sub_196(z1, z1, z0); - (void)sp_4096_add_196(r + 98, r + 98, z1); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_196(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z1[196]; - sp_digit* a1 = z1; - sp_digit* z2 = r + 196; - (void)sp_4096_add_98(a1, a, &a[98]); - sp_4096_sqr_98(z2, &a[98]); - sp_4096_sqr_98(z0, a); - sp_4096_sqr_98(z1, a1); - (void)sp_4096_sub_196(z1, z1, z2); - (void)sp_4096_sub_196(z1, z1, z0); - (void)sp_4096_add_196(r + 98, r + 98, z1); -} - -#endif /* !WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_add_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 196; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_sub_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 196; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_mul_196(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[195]) * b[195]; - r[391] = (sp_digit)(c >> 21); - c = (c & 0x1fffff) << 21; - for (k = 389; k >= 0; k--) { - for (i = 195; i >= 0; i--) { - j = k - i; - if (j >= 196) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 42); - r[k + 1] = (sp_digit)((c >> 21) & 0x1fffff); - c = (c & 0x1fffff) << 21; - } - r[0] = (sp_digit)(c >> 21); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_196(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[195]) * a[195]; - r[391] = (sp_digit)(c >> 21); - c = (c & 0x1fffff) << 21; - for (k = 389; k >= 0; k--) { - for (i = 195; i >= 0; i--) { - j = k - i; - if (j >= 196 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 42); - r[k + 1] = (sp_digit)((c >> 21) & 0x1fffff); - c = (c & 0x1fffff) << 21; - } - r[0] = (sp_digit)(c >> 21); -} - -#endif /* WOLFSSL_SP_SMALL */ -#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) +/* Normalize the values in each word to 29 bits. * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. + * a Array of sp_digit to normalize. */ -SP_NOINLINE static int sp_4096_add_98(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_4096_norm_71(sp_digit* a) { int i; - - for (i = 0; i < 98; i++) { - r[i] = a[i] + b[i]; + for (i = 0; i < 70; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } - - return 0; } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) + +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +/* Normalize the values in each word to 29 bits. * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. + * a Array of sp_digit to normalize. */ -SP_NOINLINE static int sp_4096_sub_98(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_4096_norm_142(sp_digit* a) { int i; - - for (i = 0; i < 98; i++) { - r[i] = a[i] - b[i]; + for (i = 0; i < 141; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } - - return 0; } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_4096_mul_98(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_142(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 lo; - c = ((int64_t)a[97]) * b[97]; - r[195] = (sp_digit)(c >> 21); - c = (c & 0x1fffff) << 21; - for (k = 193; k >= 0; k--) { - for (i = 97; i >= 0; i--) { - j = k - i; - if (j >= 98) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + c = ((sp_uint64)a[141]) * b[141]; + r[283] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 281; k >= 0; k--) { + if (k >= 142) { + i = k - 141; + imax = 141; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; } - r[k + 2] += (sp_digit)(c >> 42); - r[k + 1] = (sp_digit)((c >> 21) & 0x1fffff); - c = (c & 0x1fffff) << 21; } - r[0] = (sp_digit)(c >> 21); + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -9167,43 +12780,65 @@ SP_NOINLINE static void sp_4096_mul_98(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_4096_sqr_98(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_4096_sqr_142(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[97]) * a[97]; - r[195] = (sp_digit)(c >> 21); - c = (c & 0x1fffff) << 21; - for (k = 193; k >= 0; k--) { - for (i = 97; i >= 0; i--) { - j = k - i; - if (j >= 98 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + c = ((sp_uint64)a[141]) * a[141]; + r[283] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 281; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + if (k < 141) { + imax = k; } + else { + imax = 141; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; - r[k + 2] += (sp_digit)(c >> 42); - r[k + 1] = (sp_digit)((c >> 21) & 0x1fffff); - c = (c & 0x1fffff) << 21; + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 29; + c &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } } - r[0] = (sp_digit)(c >> 21); + r[0] = (sp_digit)(c >> 29); } -#endif /* WOLFSSL_SP_SMALL */ -#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ -#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */ - /* Caclulate the bottom digit of -1/a mod 2^n. * * a A single precision number. 
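[Illustrative sketch, not part of the patch] The new sp_4096_mul_142() and
sp_4096_sqr_142() above replace the old index-juggling schoolbook loops with a
column-wise scan that defers carries: with 29-bit limbs every partial product
a[i]*b[k-i] fits in 58 bits, so up to 15 of them can be summed in an unsigned
64-bit accumulator before the carry has to be folded out. The function below is
a simplified, bottom-up illustration of the same idea (the name mul_columns and
the fixed fold interval are the editor's choices, not identifiers from the
patch):

    #include <stdint.h>

    #define LIMB_BITS 29
    #define LIMB_MASK 0x1fffffffu

    /* Column-wise multiply of two n-limb numbers; r must hold 2*n limbs. */
    static void mul_columns(uint32_t* r, const uint32_t* a, const uint32_t* b,
                            int n)
    {
        uint64_t carry = 0;
        int k;

        for (k = 0; k < 2 * n - 1; k++) {
            uint64_t lo = carry;              /* low part of the column sum  */
            uint64_t hi = 0;                  /* carries folded out of lo    */
            int i    = (k < n) ? 0 : (k - n + 1);
            int imax = (k < n) ? k : (n - 1);
            int cnt  = 0;

            for (; i <= imax; i++) {
                lo += (uint64_t)a[i] * b[k - i];
                if (++cnt == 15) {            /* fold before lo can overflow */
                    hi += lo >> LIMB_BITS;
                    lo &= LIMB_MASK;
                    cnt = 0;
                }
            }
            hi += lo >> LIMB_BITS;
            r[k]  = (uint32_t)(lo & LIMB_MASK);
            carry = hi;                       /* carry into the next column  */
        }
        r[2 * n - 1] = (uint32_t)carry;
    }

The versions in the patch walk the columns from the top down and write the
folded carries straight into r[k + 1] and r[k + 2], but the overflow argument
is the same.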
@@ -9219,10 +12854,10 @@ static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) x *= 2 - b * x; /* here x*a==1 mod 2**8 */ x *= 2 - b * x; /* here x*a==1 mod 2**16 */ x *= 2 - b * x; /* here x*a==1 mod 2**32 */ - x &= 0x1fffff; + x &= 0x1fffffff; /* rho = -1/m mod b */ - *rho = ((sp_digit)1 << 21) - x; + *rho = ((sp_digit)1 << 29) - x; } /* Multiply a by scalar b into r. (r = a * b) @@ -9231,89 +12866,59 @@ static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) * a A single precision integer. * b A scalar. */ -SP_NOINLINE static void sp_4096_mul_d_196(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_d_142(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 196; i++) { + for (i = 0; i < 142; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffff); - t >>= 21; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[196] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 196; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 3] = (sp_digit)t2; - } - r[196] = (sp_digit)(t & 0x1fffff); -#endif /* WOLFSSL_SP_SMALL */ + r[142] = (sp_digit)t; } #if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_71(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 71; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * * r A single precision number. * m A single precision number. */ -static void sp_4096_mont_norm_98(sp_digit* r, const sp_digit* m) +static void sp_4096_mont_norm_71(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL int i; - for (i=0; i<97; i++) { - r[i] = 0x1fffff; + for (i=0; i<70; i++) { + r[i] = 0x1fffffff; } -#else - int i; - - for (i = 0; i < 96; i += 8) { - r[i + 0] = 0x1fffff; - r[i + 1] = 0x1fffff; - r[i + 2] = 0x1fffff; - r[i + 3] = 0x1fffff; - r[i + 4] = 0x1fffff; - r[i + 5] = 0x1fffff; - r[i + 6] = 0x1fffff; - r[i + 7] = 0x1fffff; - } - r[96] = 0x1fffff; -#endif - r[97] = 0x7ffL; + r[70] = 0x3ffffL; /* r = (2^n - 1) mod n */ - (void)sp_4096_sub_98(r, r, m); + (void)sp_4096_sub_71(r, r, m); /* Add one so r = 2^n mod m */ r[0] += 1; @@ -9326,31 +12931,14 @@ static void sp_4096_mont_norm_98(sp_digit* r, const sp_digit* m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static sp_digit sp_4096_cmp_98(const sp_digit* a, const sp_digit* b) +static sp_digit sp_4096_cmp_71(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL int i; - for (i=97; i>=0; i--) { + for (i=70; i>=0; i--) { r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 
1 : 0)); } -#else - int i; - - r |= (a[97] - b[97]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[96] - b[96]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 88; i >= 0; i -= 8) { - r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -9363,31 +12951,14 @@ static sp_digit sp_4096_cmp_98(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -static void sp_4096_cond_sub_98(sp_digit* r, const sp_digit* a, +static void sp_4096_cond_sub_71(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 98; i++) { + for (i = 0; i < 71; i++) { r[i] = a[i] - (b[i] & m); } -#else - int i; - - for (i = 0; i < 96; i += 8) { - r[i + 0] = a[i + 0] - (b[i + 0] & m); - r[i + 1] = a[i + 1] - (b[i + 1] & m); - r[i + 2] = a[i + 2] - (b[i + 2] & m); - r[i + 3] = a[i + 3] - (b[i + 3] & m); - r[i + 4] = a[i + 4] - (b[i + 4] & m); - r[i + 5] = a[i + 5] - (b[i + 5] & m); - r[i + 6] = a[i + 6] - (b[i + 6] & m); - r[i + 7] = a[i + 7] - (b[i + 7] & m); - } - r[96] = a[96] - (b[96] & m); - r[97] = a[97] - (b[97] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -9396,76 +12967,51 @@ static void sp_4096_cond_sub_98(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
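+ *
+ * (Illustrative note: this is the inner step of the Montgomery reduction
+ *  further down -- each call folds mu * m into the running value at a given
+ *  limb offset.  The WOLFSSL_SP_LARGE_CODE variant below unrolls the carry
+ *  chain four limbs at a time instead of one.)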
*/ -SP_NOINLINE static void sp_4096_mul_add_98(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_add_71(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 98; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1fffff; - t >>= 21; + for (i = 0; i < 71; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; } - r[98] += (sp_digit)t; + r[71] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[8]; + sp_int64 tb = b; + sp_int64 t[4]; int i; - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1fffff); - for (i = 0; i < 96; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 21) + (t[1] & 0x1fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 21) + (t[2] & 0x1fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 21) + (t[3] & 0x1fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 21) + (t[4] & 0x1fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 21) + (t[5] & 0x1fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 21) + (t[6] & 0x1fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 21) + (t[7] & 0x1fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 21) + (t[0] & 0x1fffff)); + t[0] = 0; + for (i = 0; i < 68; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; } - t[1] = tb * a[97]; - r[97] += (sp_digit)((t[0] >> 21) + (t[1] & 0x1fffff)); - r[98] += (sp_digit)(t[1] >> 21); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 21. - * - * a Array of sp_digit to normalize. - */ -static void sp_4096_norm_98(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 97; i++) { - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; - } -#else - int i; - for (i = 0; i < 96; i += 8) { - a[i+1] += a[i+0] >> 21; a[i+0] &= 0x1fffff; - a[i+2] += a[i+1] >> 21; a[i+1] &= 0x1fffff; - a[i+3] += a[i+2] >> 21; a[i+2] &= 0x1fffff; - a[i+4] += a[i+3] >> 21; a[i+3] &= 0x1fffff; - a[i+5] += a[i+4] >> 21; a[i+4] &= 0x1fffff; - a[i+6] += a[i+5] >> 21; a[i+5] &= 0x1fffff; - a[i+7] += a[i+6] >> 21; a[i+6] &= 0x1fffff; - a[i+8] += a[i+7] >> 21; a[i+7] &= 0x1fffff; - } - a[96+1] += a[96] >> 21; a[96] &= 0x1fffff; -#endif + t[0] += (tb * a[68]) + r[68]; + t[1] = (tb * a[69]) + r[69]; + t[2] = (tb * a[70]) + r[70]; + r[68] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[69] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[70] = t[2] & 0x1fffffff; + r[71] += (sp_digit)(t[2] >> 29); +#endif /* !WOLFSSL_SP_LARGE_CODE */ } /* Shift the result in the high 2048 bits down to the bottom. @@ -9473,45 +13019,19 @@ static void sp_4096_norm_98(sp_digit* a) * r A single precision number. * a A single precision number. 
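+ *
+ * (Illustrative note: with 29-bit limbs, 2048 bits is 70 whole limbs plus
+ *  18 bits, so the shift below drops the low 70 limbs and then shifts the
+ *  remainder right by a further 18 bits.)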
*/ -static void sp_4096_mont_shift_98(sp_digit* r, const sp_digit* a) +static void sp_4096_mont_shift_71(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - int64_t n = a[97] >> 11; - n += ((int64_t)a[98]) << 10; + sp_int64 n = a[70] >> 18; + n += ((sp_int64)a[71]) << 11; - for (i = 0; i < 97; i++) { - r[i] = n & 0x1fffff; - n >>= 21; - n += ((int64_t)a[99 + i]) << 10; + for (i = 0; i < 70; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[72 + i]) << 11; } - r[97] = (sp_digit)n; -#else - int i; - int64_t n = a[97] >> 11; - n += ((int64_t)a[98]) << 10; - for (i = 0; i < 96; i += 8) { - r[i + 0] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 99]) << 10; - r[i + 1] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 100]) << 10; - r[i + 2] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 101]) << 10; - r[i + 3] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 102]) << 10; - r[i + 4] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 103]) << 10; - r[i + 5] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 104]) << 10; - r[i + 6] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 105]) << 10; - r[i + 7] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 106]) << 10; - } - r[96] = n & 0x1fffff; n >>= 21; n += ((int64_t)a[195]) << 10; - r[97] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[98], 0, sizeof(*r) * 98U); + r[70] = (sp_digit)n; + XMEMSET(&r[71], 0, sizeof(*r) * 71U); } /* Reduce the number back to 4096 bits using Montgomery reduction. @@ -9520,26 +13040,81 @@ static void sp_4096_mont_shift_98(sp_digit* r, const sp_digit* a) * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -static void sp_4096_mont_reduce_98(sp_digit* a, const sp_digit* m, sp_digit mp) +static void sp_4096_mont_reduce_71(sp_digit* a, const sp_digit* m, sp_digit mp) { int i; sp_digit mu; - sp_4096_norm_98(a + 98); + sp_4096_norm_71(a + 71); - for (i=0; i<97; i++) { - mu = (a[i] * mp) & 0x1fffff; - sp_4096_mul_add_98(a+i, m, mu); - a[i+1] += a[i] >> 21; + for (i=0; i<70; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_4096_mul_add_71(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x7ffL; - sp_4096_mul_add_98(a+i, m, mu); - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; - sp_4096_mont_shift_98(a, a); - sp_4096_cond_sub_98(a, a, m, 0 - (((a[97] >> 11) > 0) ? + mu = (a[i] * mp) & 0x3ffffL; + sp_4096_mul_add_71(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + sp_4096_mont_shift_71(a, a); + sp_4096_cond_sub_71(a, a, m, 0 - (((a[70] - m[70]) > 0) ? (sp_digit)1 : (sp_digit)0)); - sp_4096_norm_98(a); + sp_4096_norm_71(a); +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
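+ *
+ * (Illustrative note: this is the 71-limb half-width companion to
+ *  sp_4096_mul_142(); sp_4096_mont_mul_71() below pairs it with a
+ *  Montgomery reduction, and it uses the same deferred-carry column scan.)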
+ */ +SP_NOINLINE static void sp_4096_mul_71(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 lo; + + c = ((sp_uint64)a[70]) * b[70]; + r[141] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 139; k >= 0; k--) { + if (k >= 71) { + i = k - 70; + imax = 70; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + lo &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + } + r[0] = (sp_digit)c; } /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -9551,11 +13126,75 @@ static void sp_4096_mont_reduce_98(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_4096_mont_mul_98(sp_digit* r, const sp_digit* a, +static void sp_4096_mont_mul_71(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_4096_mul_98(r, a, b); - sp_4096_mont_reduce_98(r, m, mp); + sp_4096_mul_71(r, a, b); + sp_4096_mont_reduce_71(r, m, mp); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_71(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 t; + + c = ((sp_uint64)a[70]) * a[70]; + r[141] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 139; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; + } + if (k < 70) { + imax = k; + } + else { + imax = 70; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint64 hi; + + hi = c >> 29; + c &= 0x1fffffff; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 29; + c &= 0x1fffffff; + } + r[k + 2] += (sp_digit)(hi >> 29); + r[k + 1] = (sp_digit)(hi & 0x1fffffff); + c <<= 29; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } + } + r[0] = (sp_digit)(c >> 29); } /* Square the Montgomery form number. (r = a * a mod m) @@ -9565,11 +13204,11 @@ static void sp_4096_mont_mul_98(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_4096_mont_sqr_98(sp_digit* r, const sp_digit* a, +static void sp_4096_mont_sqr_71(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_4096_sqr_98(r, a); - sp_4096_mont_reduce_98(r, m, mp); + sp_4096_sqr_71(r, a); + sp_4096_mont_reduce_71(r, m, mp); } /* Multiply a by scalar b into r. (r = a * b) @@ -9578,57 +13217,19 @@ static void sp_4096_mont_sqr_98(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
*/ -SP_NOINLINE static void sp_4096_mul_d_98(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_d_71(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 98; i++) { + for (i = 0; i < 71; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffff); - t >>= 21; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[98] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 96; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 3] = (sp_digit)t2; - } - t += tb * a[96]; - r[96] = (sp_digit)(t & 0x1fffff); - t >>= 21; - t += tb * a[97]; - r[97] = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[98] = (sp_digit)(t & 0x1fffff); -#endif /* WOLFSSL_SP_SMALL */ + r[71] = (sp_digit)t; } /* Conditionally add a and b using the mask m. @@ -9639,79 +13240,162 @@ SP_NOINLINE static void sp_4096_mul_d_98(sp_digit* r, const sp_digit* a, * b A single precision number to add. * m Mask value to apply. */ -static void sp_4096_cond_add_98(sp_digit* r, const sp_digit* a, +static void sp_4096_cond_add_71(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 98; i++) { + for (i = 0; i < 71; i++) { r[i] = a[i] + (b[i] & m); } -#else - int i; - - for (i = 0; i < 96; i += 8) { - r[i + 0] = a[i + 0] + (b[i + 0] & m); - r[i + 1] = a[i + 1] + (b[i + 1] & m); - r[i + 2] = a[i + 2] + (b[i + 2] & m); - r[i + 3] = a[i + 3] + (b[i + 3] & m); - r[i + 4] = a[i + 4] + (b[i + 4] & m); - r[i + 5] = a[i + 5] + (b[i + 5] & m); - r[i + 6] = a[i + 6] + (b[i + 6] & m); - r[i + 7] = a[i + 7] + (b[i + 7] & m); - } - r[96] = a[96] + (b[96] & m); - r[97] = a[97] + (b[97] & m); -#endif /* WOLFSSL_SP_SMALL */ } -SP_NOINLINE static void sp_4096_rshift_98(sp_digit* r, const sp_digit* a, +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static int sp_4096_add_71(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 71; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_4096_rshift_71(sp_digit* r, const sp_digit* a, byte n) { int i; -#ifdef WOLFSSL_SP_SMALL - for (i=0; i<97; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (21 - n))) & 0x1fffff; + for (i=0; i<70; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } -#else - for (i=0; i<96; i += 8) { - r[i+0] = (a[i+0] >> n) | ((a[i+1] << (21 - n)) & 0x1fffff); - r[i+1] = (a[i+1] >> n) | ((a[i+2] << (21 - n)) & 0x1fffff); - r[i+2] = (a[i+2] >> n) | ((a[i+3] << (21 - n)) & 0x1fffff); - r[i+3] = (a[i+3] >> n) | ((a[i+4] << (21 - n)) & 0x1fffff); - r[i+4] = (a[i+4] >> n) | ((a[i+5] << (21 - n)) & 0x1fffff); - r[i+5] = (a[i+5] >> n) | ((a[i+6] << (21 - n)) & 0x1fffff); - r[i+6] = (a[i+6] >> n) | ((a[i+7] << (21 - n)) & 0x1fffff); - r[i+7] = (a[i+7] >> n) | ((a[i+8] << (21 - n)) & 0x1fffff); - } - r[96] = (a[96] >> n) | ((a[97] << (21 - n)) & 0x1fffff); -#endif - r[97] = a[97] >> n; + r[70] = a[70] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_4096_div_word_98(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_4096_div_word_71(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; sp_digit r; sp_digit t; - /* All 21 bits from d1 and top 10 bits from d0. */ - d = (d1 << 10) + (d0 >> 11); + /* All 29 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 27); r = d / dv; d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; /* Up to 11 bits in r */ - /* Next 10 bits from d0. */ - r <<= 10; - d <<= 10; - d += (d0 >> 1) & ((1 << 10) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); t = d / dv; d -= t * dv; r += t; /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ /* Remaining 1 bits from d0. */ r <<= 1; d <<= 1; @@ -9719,7 +13403,7 @@ static WC_INLINE sp_digit sp_4096_div_word_98(sp_digit d1, sp_digit d0, t = d / dv; r += t; - /* All 21 bits from d1 and top 10 bits from d0. */ + /* All 29 bits from d1 and top 2 bits from d0. */ return r; } #endif /* WOLFSSL_SP_DIV_32 */ @@ -9735,19 +13419,19 @@ static WC_INLINE sp_digit sp_4096_div_word_98(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_4096_div_98(const sp_digit* a, const sp_digit* d, +static int sp_4096_div_71(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 98 + 3]; + sp_digit t1[4 * 71 + 3]; #endif sp_digit* t2 = NULL; sp_digit* sd = NULL; @@ -9756,7 +13440,7 @@ static int sp_4096_div_98(const sp_digit* a, const sp_digit* d, (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 98 + 3), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 71 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; @@ -9765,53 +13449,60 @@ static int sp_4096_div_98(const sp_digit* a, const sp_digit* d, (void)m; if (err == MP_OKAY) { - t2 = t1 + 196 + 1; - sd = t2 + 98 + 1; + t2 = t1 + 142 + 1; + sd = t2 + 71 + 1; - sp_4096_mul_d_98(sd, d, (sp_digit)1 << 10); - sp_4096_mul_d_196(t1, a, (sp_digit)1 << 10); - dv = sd[97]; - t1[98 + 98] += t1[98 + 98 - 1] >> 21; - t1[98 + 98 - 1] &= 0x1fffff; - for (i=98; i>=0; i--) { + sp_4096_mul_d_71(sd, d, (sp_digit)1 << 11); + sp_4096_mul_d_142(t1, a, (sp_digit)1 << 11); + dv = sd[70]; + t1[71 + 71] += t1[71 + 71 - 1] >> 29; + t1[71 + 71 - 1] &= 0x1fffffff; + for (i=71; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[98 + i]; - d1 <<= 21; - d1 += t1[98 + i - 1]; + d1 = t1[71 + i]; + d1 <<= 29; + d1 += t1[71 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_4096_div_word_98(t1[98 + i], t1[98 + i - 1], dv); + r1 = sp_4096_div_word_71(t1[71 + i], t1[71 + i - 1], dv); #endif - sp_4096_mul_d_98(t2, sd, r1); - (void)sp_4096_sub_98(&t1[i], &t1[i], t2); - sp_4096_norm_98(&t1[i]); - t1[98 + i] -= t2[98]; - t1[98 + i] += t1[98 + i - 1] >> 21; - t1[98 + i - 1] &= 0x1fffff; - r1 = (((-t1[98 + i]) << 21) - t1[98 + i - 1]) / dv; - r1 -= t1[98 + i]; - sp_4096_mul_d_98(t2, sd, r1); - (void)sp_4096_add_98(&t1[i], &t1[i], t2); - t1[98 + i] += t1[98 + i - 1] >> 21; - t1[98 + i - 1] &= 0x1fffff; + sp_4096_mul_d_71(t2, sd, r1); + (void)sp_4096_sub_71(&t1[i], &t1[i], t2); + sp_4096_norm_71(&t1[i]); + t1[71 + i] -= t2[71]; + t1[71 + i] += t1[71 + i - 1] >> 29; + t1[71 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[71 + i]; + d1 <<= 29; + d1 -= t1[71 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_71(-t1[71 + i], -t1[71 + i - 1], dv); +#endif + r1 -= t1[71 + i]; + sp_4096_mul_d_71(t2, sd, r1); + (void)sp_4096_add_71(&t1[i], &t1[i], t2); + t1[71 + i] += t1[71 + i - 1] >> 29; + t1[71 + i - 1] &= 0x1fffffff; } - t1[98 - 1] += t1[98 - 2] >> 21; - t1[98 - 2] &= 0x1fffff; - r1 = t1[98 - 1] / dv; + t1[71 - 1] += t1[71 - 2] >> 29; + t1[71 - 2] &= 0x1fffffff; + r1 = t1[71 - 1] / dv; - sp_4096_mul_d_98(t2, sd, r1); - sp_4096_sub_98(t1, 
t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 196U); - for (i=0; i<97; i++) { - r[i+1] += r[i] >> 21; - r[i] &= 0x1fffff; + sp_4096_mul_d_71(t2, sd, r1); + sp_4096_sub_71(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 142U); + for (i=0; i<70; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } - sp_4096_cond_add_98(r, r, sd, 0 - ((r[97] < 0) ? + sp_4096_cond_add_71(r, r, sd, 0 - ((r[70] < 0) ? (sp_digit)1 : (sp_digit)0)); - sp_4096_norm_98(r); - sp_4096_rshift_98(r, r, 10); + sp_4096_norm_71(r); + sp_4096_rshift_71(r, r, 11); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -9829,9 +13520,9 @@ static int sp_4096_div_98(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_4096_mod_98(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_4096_mod_71(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_4096_div_98(a, m, NULL, r); + return sp_4096_div_71(a, m, NULL, r); } /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -9843,14 +13534,14 @@ static int sp_4096_mod_98(sp_digit* r, const sp_digit* a, const sp_digit* m) * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. */ -static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, +static int sp_4096_mod_exp_71(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { #if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 196]; + sp_digit td[3 * 142]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -9862,7 +13553,7 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 98 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 71 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -9871,29 +13562,29 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 98 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 98U * 2U); + t[i] = td + (i * 71 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 71U * 2U); } sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_98(norm, m); + sp_4096_mont_norm_71(norm, m); if (reduceA != 0) { - err = sp_4096_mod_98(t[1], a, m); + err = sp_4096_mod_71(t[1], a, m); } else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 98U); + XMEMCPY(t[1], a, sizeof(sp_digit) * 71U); } } if (err == MP_OKAY) { - sp_4096_mul_98(t[1], t[1], norm); - err = sp_4096_mod_98(t[1], t[1], m); + sp_4096_mul_71(t[1], t[1], norm); + err = sp_4096_mod_71(t[1], t[1], m); } if (err == MP_OKAY) { - i = bits / 21; - c = bits % 21; - n = e[i--] << (21 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -9901,28 +13592,28 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 21; + c = 29; } - y = (int)((n >> 20) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_4096_mont_mul_98(t[y^1], t[0], t[1], m, mp); + sp_4096_mont_mul_71(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & 
addr_mask[y])), - sizeof(*t[2]) * 98 * 2); - sp_4096_mont_sqr_98(t[2], t[2], m, mp); + sizeof(*t[2]) * 71 * 2); + sp_4096_mont_sqr_71(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 98 * 2); + sizeof(*t[2]) * 71 * 2); } - sp_4096_mont_reduce_98(t[0], m, mp); - n = sp_4096_cmp_98(t[0], m); - sp_4096_cond_sub_98(t[0], t[0], m, ((n < 0) ? + sp_4096_mont_reduce_71(t[0], m, mp); + n = sp_4096_cmp_71(t[0], m); + sp_4096_cond_sub_71(t[0], t[0], m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 98 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 71 * 2); } @@ -9936,7 +13627,7 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 196]; + sp_digit td[3 * 142]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -9948,7 +13639,7 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 98 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 71 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -9957,29 +13648,29 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 98 * 2); + t[i] = td + (i * 71 * 2); } sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_98(norm, m); + sp_4096_mont_norm_71(norm, m); if (reduceA != 0) { - err = sp_4096_mod_98(t[1], a, m); + err = sp_4096_mod_71(t[1], a, m); if (err == MP_OKAY) { - sp_4096_mul_98(t[1], t[1], norm); - err = sp_4096_mod_98(t[1], t[1], m); + sp_4096_mul_71(t[1], t[1], norm); + err = sp_4096_mod_71(t[1], t[1], m); } } else { - sp_4096_mul_98(t[1], a, norm); - err = sp_4096_mod_98(t[1], t[1], m); + sp_4096_mul_71(t[1], a, norm); + err = sp_4096_mod_71(t[1], t[1], m); } } if (err == MP_OKAY) { - i = bits / 21; - c = bits % 21; - n = e[i--] << (21 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -9987,28 +13678,28 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, } n = e[i--]; - c = 21; + c = 29; } - y = (int)((n >> 20) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_4096_mont_mul_98(t[y^1], t[0], t[1], m, mp); + sp_4096_mont_mul_71(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 98 * 2); - sp_4096_mont_sqr_98(t[2], t[2], m, mp); + sizeof(*t[2]) * 71 * 2); + sp_4096_mont_sqr_71(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 98 * 2); + sizeof(*t[2]) * 71 * 2); } - sp_4096_mont_reduce_98(t[0], m, mp); - n = sp_4096_cmp_98(t[0], m); - sp_4096_cond_sub_98(t[0], t[0], m, ((n < 0) ? + sp_4096_mont_reduce_71(t[0], m, mp); + n = sp_4096_cmp_71(t[0], m); + sp_4096_cond_sub_71(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 98 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 71 * 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -10021,7 +13712,7 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 196) + 196]; + sp_digit td[(32 * 142) + 142]; #endif sp_digit* t[32]; sp_digit* rt = NULL; @@ -10034,7 +13725,7 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 196) + 196), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 142) + 142), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -10043,64 +13734,64 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; for (i=0; i<32; i++) - t[i] = td + i * 196; - rt = td + 6272; + t[i] = td + i * 142; + rt = td + 4544; sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_98(norm, m); + sp_4096_mont_norm_71(norm, m); if (reduceA != 0) { - err = sp_4096_mod_98(t[1], a, m); + err = sp_4096_mod_71(t[1], a, m); if (err == MP_OKAY) { - sp_4096_mul_98(t[1], t[1], norm); - err = sp_4096_mod_98(t[1], t[1], m); + sp_4096_mul_71(t[1], t[1], norm); + err = sp_4096_mod_71(t[1], t[1], m); } } else { - sp_4096_mul_98(t[1], a, norm); - err = sp_4096_mod_98(t[1], t[1], m); + sp_4096_mul_71(t[1], a, norm); + err = sp_4096_mod_71(t[1], t[1], m); } } if (err == MP_OKAY) { - sp_4096_mont_sqr_98(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_98(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_98(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_98(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_98(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_98(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_98(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_98(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_98(t[10], t[ 5], m, mp); - sp_4096_mont_mul_98(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_98(t[12], t[ 6], m, mp); - sp_4096_mont_mul_98(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_98(t[14], t[ 7], m, mp); - sp_4096_mont_mul_98(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_98(t[16], t[ 8], m, mp); - sp_4096_mont_mul_98(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_98(t[18], t[ 9], m, mp); - sp_4096_mont_mul_98(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_98(t[20], t[10], m, mp); - sp_4096_mont_mul_98(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_98(t[22], t[11], m, mp); - sp_4096_mont_mul_98(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_98(t[24], t[12], m, mp); - sp_4096_mont_mul_98(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_98(t[26], t[13], m, mp); - sp_4096_mont_mul_98(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_98(t[28], t[14], m, mp); - sp_4096_mont_mul_98(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_98(t[30], t[15], m, mp); - sp_4096_mont_mul_98(t[31], t[16], t[15], m, mp); + sp_4096_mont_sqr_71(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_71(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_71(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_71(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_71(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_71(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_71(t[ 8], t[ 4], m, mp); + sp_4096_mont_mul_71(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_71(t[10], t[ 5], m, mp); + sp_4096_mont_mul_71(t[11], t[ 6], t[ 
5], m, mp); + sp_4096_mont_sqr_71(t[12], t[ 6], m, mp); + sp_4096_mont_mul_71(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_71(t[14], t[ 7], m, mp); + sp_4096_mont_mul_71(t[15], t[ 8], t[ 7], m, mp); + sp_4096_mont_sqr_71(t[16], t[ 8], m, mp); + sp_4096_mont_mul_71(t[17], t[ 9], t[ 8], m, mp); + sp_4096_mont_sqr_71(t[18], t[ 9], m, mp); + sp_4096_mont_mul_71(t[19], t[10], t[ 9], m, mp); + sp_4096_mont_sqr_71(t[20], t[10], m, mp); + sp_4096_mont_mul_71(t[21], t[11], t[10], m, mp); + sp_4096_mont_sqr_71(t[22], t[11], m, mp); + sp_4096_mont_mul_71(t[23], t[12], t[11], m, mp); + sp_4096_mont_sqr_71(t[24], t[12], m, mp); + sp_4096_mont_mul_71(t[25], t[13], t[12], m, mp); + sp_4096_mont_sqr_71(t[26], t[13], m, mp); + sp_4096_mont_mul_71(t[27], t[14], t[13], m, mp); + sp_4096_mont_sqr_71(t[28], t[14], m, mp); + sp_4096_mont_mul_71(t[29], t[15], t[14], m, mp); + sp_4096_mont_sqr_71(t[30], t[15], m, mp); + sp_4096_mont_mul_71(t[31], t[16], t[15], m, mp); bits = ((bits + 4) / 5) * 5; - i = ((bits + 20) / 21) - 1; - c = bits % 21; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 21; + c = 29; } - if (i < 98) { + if (i < 71) { n = e[i--] << (32 - c); } else { @@ -10108,36 +13799,48 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, i--; } if (c < 5) { - n |= e[i--] << (11 - c); - c += 21; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 27) & 0x1f); n <<= 5; c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 196); + XMEMCPY(rt, t[y], sizeof(sp_digit) * 142); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (11 - c); - c += 21; + if (c >= 5) { + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c = 24; + } + else { + y = (byte)((n >> 27) & 0x1f); + n = e[i--] << 3; + c = 5 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - sp_4096_mont_sqr_98(rt, rt, m, mp); - sp_4096_mont_sqr_98(rt, rt, m, mp); - sp_4096_mont_sqr_98(rt, rt, m, mp); - sp_4096_mont_sqr_98(rt, rt, m, mp); - sp_4096_mont_sqr_98(rt, rt, m, mp); + sp_4096_mont_sqr_71(rt, rt, m, mp); + sp_4096_mont_sqr_71(rt, rt, m, mp); + sp_4096_mont_sqr_71(rt, rt, m, mp); + sp_4096_mont_sqr_71(rt, rt, m, mp); + sp_4096_mont_sqr_71(rt, rt, m, mp); - sp_4096_mont_mul_98(rt, rt, t[y], m, mp); + sp_4096_mont_mul_71(rt, rt, t[y], m, mp); } - sp_4096_mont_reduce_98(rt, m, mp); - n = sp_4096_cmp_98(rt, m); - sp_4096_cond_sub_98(rt, rt, m, ((n < 0) ? + sp_4096_mont_reduce_71(rt, m, mp); + n = sp_4096_cmp_71(rt, m); + sp_4096_cond_sub_71(rt, rt, m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 196); + XMEMCPY(r, rt, sizeof(sp_digit) * 142); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -10152,42 +13855,42 @@ static int sp_4096_mod_exp_98(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ #endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */ +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_142(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 142; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. 
* * r A single precision number. * m A single precision number. */ -static void sp_4096_mont_norm_196(sp_digit* r, const sp_digit* m) +static void sp_4096_mont_norm_142(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL int i; - for (i=0; i<195; i++) { - r[i] = 0x1fffff; + for (i=0; i<141; i++) { + r[i] = 0x1fffffff; } -#else - int i; - - for (i = 0; i < 192; i += 8) { - r[i + 0] = 0x1fffff; - r[i + 1] = 0x1fffff; - r[i + 2] = 0x1fffff; - r[i + 3] = 0x1fffff; - r[i + 4] = 0x1fffff; - r[i + 5] = 0x1fffff; - r[i + 6] = 0x1fffff; - r[i + 7] = 0x1fffff; - } - r[192] = 0x1fffff; - r[193] = 0x1fffff; - r[194] = 0x1fffff; -#endif - r[195] = 0x1L; + r[141] = 0x7fL; /* r = (2^n - 1) mod n */ - (void)sp_4096_sub_196(r, r, m); + (void)sp_4096_sub_142(r, r, m); /* Add one so r = 2^n mod m */ r[0] += 1; @@ -10200,33 +13903,14 @@ static void sp_4096_mont_norm_196(sp_digit* r, const sp_digit* m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -static sp_digit sp_4096_cmp_196(const sp_digit* a, const sp_digit* b) +static sp_digit sp_4096_cmp_142(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL int i; - for (i=195; i>=0; i--) { + for (i=141; i>=0; i--) { r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#else - int i; - - r |= (a[195] - b[195]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[194] - b[194]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[193] - b[193]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[192] - b[192]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - for (i = 184; i >= 0; i -= 8) { - r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -10239,33 +13923,14 @@ static sp_digit sp_4096_cmp_196(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -static void sp_4096_cond_sub_196(sp_digit* r, const sp_digit* a, +static void sp_4096_cond_sub_142(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 196; i++) { + for (i = 0; i < 142; i++) { r[i] = a[i] - (b[i] & m); } -#else - int i; - - for (i = 0; i < 192; i += 8) { - r[i + 0] = a[i + 0] - (b[i + 0] & m); - r[i + 1] = a[i + 1] - (b[i + 1] & m); - r[i + 2] = a[i + 2] - (b[i + 2] & m); - r[i + 3] = a[i + 3] - (b[i + 3] & m); - r[i + 4] = a[i + 4] - (b[i + 4] & m); - r[i + 5] = a[i + 5] - (b[i + 5] & m); - r[i + 6] = a[i + 6] - (b[i + 6] & m); - r[i + 7] = a[i + 7] - (b[i + 7] & m); - } - r[192] = a[192] - (b[192] & m); - r[193] = a[193] - (b[193] & m); - r[194] = a[194] - (b[194] & m); - r[195] = a[195] - (b[195] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -10274,82 +13939,48 @@ static void sp_4096_cond_sub_196(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
*/ -SP_NOINLINE static void sp_4096_mul_add_196(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_add_142(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 196; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1fffff; - t >>= 21; + for (i = 0; i < 142; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; } - r[196] += (sp_digit)t; + r[142] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[8]; + sp_int64 tb = b; + sp_int64 t[4]; int i; - t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1fffff); - for (i = 0; i < 192; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] += (sp_digit)((t[0] >> 21) + (t[1] & 0x1fffff)); - t[2] = tb * a[i+2]; - r[i+2] += (sp_digit)((t[1] >> 21) + (t[2] & 0x1fffff)); - t[3] = tb * a[i+3]; - r[i+3] += (sp_digit)((t[2] >> 21) + (t[3] & 0x1fffff)); - t[4] = tb * a[i+4]; - r[i+4] += (sp_digit)((t[3] >> 21) + (t[4] & 0x1fffff)); - t[5] = tb * a[i+5]; - r[i+5] += (sp_digit)((t[4] >> 21) + (t[5] & 0x1fffff)); - t[6] = tb * a[i+6]; - r[i+6] += (sp_digit)((t[5] >> 21) + (t[6] & 0x1fffff)); - t[7] = tb * a[i+7]; - r[i+7] += (sp_digit)((t[6] >> 21) + (t[7] & 0x1fffff)); - t[0] = tb * a[i+8]; - r[i+8] += (sp_digit)((t[7] >> 21) + (t[0] & 0x1fffff)); + t[0] = 0; + for (i = 0; i < 140; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; } - t[1] = tb * a[193]; - r[193] += (sp_digit)((t[0] >> 21) + (t[1] & 0x1fffff)); - t[2] = tb * a[194]; - r[194] += (sp_digit)((t[1] >> 21) + (t[2] & 0x1fffff)); - t[3] = tb * a[195]; - r[195] += (sp_digit)((t[2] >> 21) + (t[3] & 0x1fffff)); - r[196] += (sp_digit)(t[3] >> 21); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 21. - * - * a Array of sp_digit to normalize. - */ -static void sp_4096_norm_196(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 195; i++) { - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; - } -#else - int i; - for (i = 0; i < 192; i += 8) { - a[i+1] += a[i+0] >> 21; a[i+0] &= 0x1fffff; - a[i+2] += a[i+1] >> 21; a[i+1] &= 0x1fffff; - a[i+3] += a[i+2] >> 21; a[i+2] &= 0x1fffff; - a[i+4] += a[i+3] >> 21; a[i+3] &= 0x1fffff; - a[i+5] += a[i+4] >> 21; a[i+4] &= 0x1fffff; - a[i+6] += a[i+5] >> 21; a[i+5] &= 0x1fffff; - a[i+7] += a[i+6] >> 21; a[i+6] &= 0x1fffff; - a[i+8] += a[i+7] >> 21; a[i+7] &= 0x1fffff; - } - a[192+1] += a[192] >> 21; a[192] &= 0x1fffff; - a[193+1] += a[193] >> 21; a[193] &= 0x1fffff; - a[194+1] += a[194] >> 21; a[194] &= 0x1fffff; -#endif + t[0] += (tb * a[140]) + r[140]; + t[1] = (tb * a[141]) + r[141]; + r[140] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[141] = t[1] & 0x1fffffff; + r[142] += (sp_digit)(t[1] >> 29); +#endif /* !WOLFSSL_SP_LARGE_CODE */ } /* Shift the result in the high 4096 bits down to the bottom. @@ -10357,47 +13988,19 @@ static void sp_4096_norm_196(sp_digit* a) * r A single precision number. * a A single precision number. 
*/ -static void sp_4096_mont_shift_196(sp_digit* r, const sp_digit* a) +static void sp_4096_mont_shift_142(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - int64_t n = a[195] >> 1; - n += ((int64_t)a[196]) << 20; + sp_int64 n = a[141] >> 7; + n += ((sp_int64)a[142]) << 22; - for (i = 0; i < 195; i++) { - r[i] = n & 0x1fffff; - n >>= 21; - n += ((int64_t)a[197 + i]) << 20; + for (i = 0; i < 141; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[143 + i]) << 22; } - r[195] = (sp_digit)n; -#else - int i; - int64_t n = a[195] >> 1; - n += ((int64_t)a[196]) << 20; - for (i = 0; i < 192; i += 8) { - r[i + 0] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 197]) << 20; - r[i + 1] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 198]) << 20; - r[i + 2] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 199]) << 20; - r[i + 3] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 200]) << 20; - r[i + 4] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 201]) << 20; - r[i + 5] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 202]) << 20; - r[i + 6] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 203]) << 20; - r[i + 7] = n & 0x1fffff; - n >>= 21; n += ((int64_t)a[i + 204]) << 20; - } - r[192] = n & 0x1fffff; n >>= 21; n += ((int64_t)a[389]) << 20; - r[193] = n & 0x1fffff; n >>= 21; n += ((int64_t)a[390]) << 20; - r[194] = n & 0x1fffff; n >>= 21; n += ((int64_t)a[391]) << 20; - r[195] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[196], 0, sizeof(*r) * 196U); + r[141] = (sp_digit)n; + XMEMSET(&r[142], 0, sizeof(*r) * 142U); } /* Reduce the number back to 4096 bits using Montgomery reduction. @@ -10406,51 +14009,51 @@ static void sp_4096_mont_shift_196(sp_digit* r, const sp_digit* a) * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
*/ -static void sp_4096_mont_reduce_196(sp_digit* a, const sp_digit* m, sp_digit mp) +static void sp_4096_mont_reduce_142(sp_digit* a, const sp_digit* m, sp_digit mp) { int i; sp_digit mu; - sp_4096_norm_196(a + 196); + sp_4096_norm_142(a + 142); #ifdef WOLFSSL_SP_DH if (mp != 1) { - for (i=0; i<195; i++) { - mu = (a[i] * mp) & 0x1fffff; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; + for (i=0; i<141; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x1L; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; + mu = (a[i] * mp) & 0x7fL; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } else { - for (i=0; i<195; i++) { - mu = a[i] & 0x1fffff; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; + for (i=0; i<141; i++) { + mu = a[i] & 0x1fffffff; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = a[i] & 0x1L; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; + mu = a[i] & 0x7fL; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; } #else - for (i=0; i<195; i++) { - mu = (a[i] * mp) & 0x1fffff; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; + for (i=0; i<141; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; } - mu = (a[i] * mp) & 0x1L; - sp_4096_mul_add_196(a+i, m, mu); - a[i+1] += a[i] >> 21; - a[i] &= 0x1fffff; + mu = (a[i] * mp) & 0x7fL; + sp_4096_mul_add_142(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; #endif - sp_4096_mont_shift_196(a, a); - sp_4096_cond_sub_196(a, a, m, 0 - (((a[195] >> 1) > 0) ? + sp_4096_mont_shift_142(a, a); + sp_4096_cond_sub_142(a, a, m, 0 - (((a[141] - m[141]) > 0) ? (sp_digit)1 : (sp_digit)0)); - sp_4096_norm_196(a); + sp_4096_norm_142(a); } /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -10462,11 +14065,11 @@ static void sp_4096_mont_reduce_196(sp_digit* a, const sp_digit* m, sp_digit mp) * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_4096_mont_mul_196(sp_digit* r, const sp_digit* a, +static void sp_4096_mont_mul_142(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_4096_mul_196(r, a, b); - sp_4096_mont_reduce_196(r, m, mp); + sp_4096_mul_142(r, a, b); + sp_4096_mont_reduce_142(r, m, mp); } /* Square the Montgomery form number. (r = a * a mod m) @@ -10476,11 +14079,11 @@ static void sp_4096_mont_mul_196(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_4096_mont_sqr_196(sp_digit* r, const sp_digit* a, +static void sp_4096_mont_sqr_142(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_4096_sqr_196(r, a); - sp_4096_mont_reduce_196(r, m, mp); + sp_4096_sqr_142(r, a); + sp_4096_mont_reduce_142(r, m, mp); } /* Multiply a by scalar b into r. (r = a * b) @@ -10489,51 +14092,19 @@ static void sp_4096_mont_sqr_196(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A scalar. 
*/ -SP_NOINLINE static void sp_4096_mul_d_392(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_d_284(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; - for (i = 0; i < 392; i++) { + for (i = 0; i < 284; i++) { t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffff); - t >>= 21; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; } - r[392] = (sp_digit)t; -#else - int64_t tb = b; - int64_t t = 0; - sp_digit t2; - int64_t p[4]; - int i; - - for (i = 0; i < 392; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x1fffff); - t >>= 21; - r[i + 3] = (sp_digit)t2; - } - r[392] = (sp_digit)(t & 0x1fffff); -#endif /* WOLFSSL_SP_SMALL */ + r[284] = (sp_digit)t; } /* Conditionally add a and b using the mask m. @@ -10544,83 +14115,162 @@ SP_NOINLINE static void sp_4096_mul_d_392(sp_digit* r, const sp_digit* a, * b A single precision number to add. * m Mask value to apply. */ -static void sp_4096_cond_add_196(sp_digit* r, const sp_digit* a, +static void sp_4096_cond_add_142(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 196; i++) { + for (i = 0; i < 71; i++) { r[i] = a[i] + (b[i] & m); } -#else - int i; - - for (i = 0; i < 192; i += 8) { - r[i + 0] = a[i + 0] + (b[i + 0] & m); - r[i + 1] = a[i + 1] + (b[i + 1] & m); - r[i + 2] = a[i + 2] + (b[i + 2] & m); - r[i + 3] = a[i + 3] + (b[i + 3] & m); - r[i + 4] = a[i + 4] + (b[i + 4] & m); - r[i + 5] = a[i + 5] + (b[i + 5] & m); - r[i + 6] = a[i + 6] + (b[i + 6] & m); - r[i + 7] = a[i + 7] + (b[i + 7] & m); - } - r[192] = a[192] + (b[192] & m); - r[193] = a[193] + (b[193] & m); - r[194] = a[194] + (b[194] & m); - r[195] = a[195] + (b[195] & m); -#endif /* WOLFSSL_SP_SMALL */ } -SP_NOINLINE static void sp_4096_rshift_196(sp_digit* r, const sp_digit* a, +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static int sp_4096_add_142(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 142; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_4096_rshift_142(sp_digit* r, const sp_digit* a, byte n) { int i; -#ifdef WOLFSSL_SP_SMALL - for (i=0; i<195; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (21 - n))) & 0x1fffff; + for (i=0; i<141; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } -#else - for (i=0; i<192; i += 8) { - r[i+0] = (a[i+0] >> n) | ((a[i+1] << (21 - n)) & 0x1fffff); - r[i+1] = (a[i+1] >> n) | ((a[i+2] << (21 - n)) & 0x1fffff); - r[i+2] = (a[i+2] >> n) | ((a[i+3] << (21 - n)) & 0x1fffff); - r[i+3] = (a[i+3] >> n) | ((a[i+4] << (21 - n)) & 0x1fffff); - r[i+4] = (a[i+4] >> n) | ((a[i+5] << (21 - n)) & 0x1fffff); - r[i+5] = (a[i+5] >> n) | ((a[i+6] << (21 - n)) & 0x1fffff); - r[i+6] = (a[i+6] >> n) | ((a[i+7] << (21 - n)) & 0x1fffff); - r[i+7] = (a[i+7] >> n) | ((a[i+8] << (21 - n)) & 0x1fffff); - } - r[192] = (a[192] >> n) | ((a[193] << (21 - n)) & 0x1fffff); - r[193] = (a[193] >> n) | ((a[194] << (21 - n)) & 0x1fffff); - r[194] = (a[194] >> n) | ((a[195] << (21 - n)) & 0x1fffff); -#endif - r[195] = a[195] >> n; + r[141] = a[141] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_4096_div_word_196(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_4096_div_word_142(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; sp_digit r; sp_digit t; - /* All 21 bits from d1 and top 10 bits from d0. */ - d = (d1 << 10) + (d0 >> 11); + /* All 29 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 27); r = d / dv; d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; /* Up to 11 bits in r */ - /* Next 10 bits from d0. */ - r <<= 10; - d <<= 10; - d += (d0 >> 1) & ((1 << 10) - 1); + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); t = d / dv; d -= t * dv; r += t; /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ /* Remaining 1 bits from d0. */ r <<= 1; d <<= 1; @@ -10628,7 +14278,7 @@ static WC_INLINE sp_digit sp_4096_div_word_196(sp_digit d1, sp_digit d0, t = d / dv; r += t; - /* All 21 bits from d1 and top 10 bits from d0. */ + /* All 29 bits from d1 and top 2 bits from d0. */ return r; } #endif /* WOLFSSL_SP_DIV_32 */ @@ -10644,19 +14294,19 @@ static WC_INLINE sp_digit sp_4096_div_word_196(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_4096_div_196(const sp_digit* a, const sp_digit* d, +static int sp_4096_div_142(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 196 + 3]; + sp_digit t1[4 * 142 + 3]; #endif sp_digit* t2 = NULL; sp_digit* sd = NULL; @@ -10665,7 +14315,7 @@ static int sp_4096_div_196(const sp_digit* a, const sp_digit* d, (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 196 + 3), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 142 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; @@ -10674,53 +14324,60 @@ static int sp_4096_div_196(const sp_digit* a, const sp_digit* d, (void)m; if (err == MP_OKAY) { - t2 = t1 + 392 + 1; - sd = t2 + 196 + 1; + t2 = t1 + 284 + 1; + sd = t2 + 142 + 1; - sp_4096_mul_d_196(sd, d, (sp_digit)1 << 20); - sp_4096_mul_d_392(t1, a, (sp_digit)1 << 20); - dv = sd[195]; - t1[196 + 196] += t1[196 + 196 - 1] >> 21; - t1[196 + 196 - 1] &= 0x1fffff; - for (i=196; i>=0; i--) { + sp_4096_mul_d_142(sd, d, (sp_digit)1 << 22); + sp_4096_mul_d_284(t1, a, (sp_digit)1 << 22); + dv = sd[141]; + t1[142 + 142] += t1[142 + 142 - 1] >> 29; + t1[142 + 142 - 1] &= 0x1fffffff; + for (i=142; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[196 + i]; - d1 <<= 21; - d1 += t1[196 + i - 1]; + d1 = t1[142 + i]; + d1 <<= 29; + d1 += t1[142 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_4096_div_word_196(t1[196 + i], t1[196 + i - 1], dv); + r1 = sp_4096_div_word_142(t1[142 + i], t1[142 + i - 1], dv); #endif - sp_4096_mul_d_196(t2, sd, r1); - (void)sp_4096_sub_196(&t1[i], &t1[i], t2); - sp_4096_norm_196(&t1[i]); - t1[196 + i] -= t2[196]; - t1[196 + i] += t1[196 + i - 1] >> 21; - t1[196 + i - 1] &= 0x1fffff; - r1 = (((-t1[196 + i]) << 21) - t1[196 + i - 1]) / dv; - r1 -= t1[196 + i]; - sp_4096_mul_d_196(t2, sd, r1); - (void)sp_4096_add_196(&t1[i], &t1[i], t2); - t1[196 + i] += t1[196 + i - 1] >> 21; - t1[196 + i - 1] &= 0x1fffff; + sp_4096_mul_d_142(t2, sd, r1); + (void)sp_4096_sub_142(&t1[i], &t1[i], t2); + sp_4096_norm_142(&t1[i]); + t1[142 + i] -= t2[142]; + t1[142 + i] += t1[142 + i - 1] >> 29; + t1[142 + i - 1] &= 0x1fffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[142 + i]; + d1 <<= 29; + d1 -= t1[142 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_142(-t1[142 + i], -t1[142 + i - 1], dv); +#endif + r1 -= t1[142 + i]; + sp_4096_mul_d_142(t2, sd, r1); + (void)sp_4096_add_142(&t1[i], &t1[i], t2); + t1[142 + i] += t1[142 + i - 1] >> 29; + t1[142 + i - 1] &= 
0x1fffffff; } - t1[196 - 1] += t1[196 - 2] >> 21; - t1[196 - 2] &= 0x1fffff; - r1 = t1[196 - 1] / dv; + t1[142 - 1] += t1[142 - 2] >> 29; + t1[142 - 2] &= 0x1fffffff; + r1 = t1[142 - 1] / dv; - sp_4096_mul_d_196(t2, sd, r1); - sp_4096_sub_196(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 392U); - for (i=0; i<195; i++) { - r[i+1] += r[i] >> 21; - r[i] &= 0x1fffff; + sp_4096_mul_d_142(t2, sd, r1); + sp_4096_sub_142(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 284U); + for (i=0; i<141; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; } - sp_4096_cond_add_196(r, r, sd, 0 - ((r[195] < 0) ? + sp_4096_cond_add_142(r, r, sd, 0 - ((r[141] < 0) ? (sp_digit)1 : (sp_digit)0)); - sp_4096_norm_196(r); - sp_4096_rshift_196(r, r, 20); + sp_4096_norm_142(r); + sp_4096_rshift_142(r, r, 22); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -10738,13 +14395,11 @@ static int sp_4096_div_196(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_4096_mod_196(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_4096_mod_142(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_4096_div_196(a, m, NULL, r); + return sp_4096_div_142(a, m, NULL, r); } -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ - defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -10754,14 +14409,14 @@ static int sp_4096_mod_196(sp_digit* r, const sp_digit* a, const sp_digit* m) * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
*/ -static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e, +static int sp_4096_mod_exp_142(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { #if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 392]; + sp_digit td[3 * 284]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -10773,7 +14428,7 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 196 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 142 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -10782,29 +14437,29 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 196 * 2); - XMEMSET(t[i], 0, sizeof(sp_digit) * 196U * 2U); + t[i] = td + (i * 142 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 142U * 2U); } sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); if (reduceA != 0) { - err = sp_4096_mod_196(t[1], a, m); + err = sp_4096_mod_142(t[1], a, m); } else { - XMEMCPY(t[1], a, sizeof(sp_digit) * 196U); + XMEMCPY(t[1], a, sizeof(sp_digit) * 142U); } } if (err == MP_OKAY) { - sp_4096_mul_196(t[1], t[1], norm); - err = sp_4096_mod_196(t[1], t[1], m); + sp_4096_mul_142(t[1], t[1], norm); + err = sp_4096_mod_142(t[1], t[1], m); } if (err == MP_OKAY) { - i = bits / 21; - c = bits % 21; - n = e[i--] << (21 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -10812,28 +14467,28 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e } n = e[i--]; - c = 21; + c = 29; } - y = (int)((n >> 20) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_4096_mont_mul_196(t[y^1], t[0], t[1], m, mp); + sp_4096_mont_mul_142(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 196 * 2); - sp_4096_mont_sqr_196(t[2], t[2], m, mp); + sizeof(*t[2]) * 142 * 2); + sp_4096_mont_sqr_142(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 196 * 2); + sizeof(*t[2]) * 142 * 2); } - sp_4096_mont_reduce_196(t[0], m, mp); - n = sp_4096_cmp_196(t[0], m); - sp_4096_cond_sub_196(t[0], t[0], m, ((n < 0) ? + sp_4096_mont_reduce_142(t[0], m, mp); + n = sp_4096_cmp_142(t[0], m); + sp_4096_cond_sub_142(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 196 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 142 * 2); } @@ -10847,7 +14502,7 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[3 * 392]; + sp_digit td[3 * 284]; #endif sp_digit* t[3] = {0, 0, 0}; sp_digit* norm = NULL; @@ -10859,7 +14514,7 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 196 * 2, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 142 * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -10868,29 +14523,29 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; for (i=0; i<3; i++) { - t[i] = td + (i * 196 * 2); + t[i] = td + (i * 142 * 2); } sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); if (reduceA != 0) { - err = sp_4096_mod_196(t[1], a, m); + err = sp_4096_mod_142(t[1], a, m); if (err == MP_OKAY) { - sp_4096_mul_196(t[1], t[1], norm); - err = sp_4096_mod_196(t[1], t[1], m); + sp_4096_mul_142(t[1], t[1], norm); + err = sp_4096_mod_142(t[1], t[1], m); } } else { - sp_4096_mul_196(t[1], a, norm); - err = sp_4096_mod_196(t[1], t[1], m); + sp_4096_mul_142(t[1], a, norm); + err = sp_4096_mod_142(t[1], t[1], m); } } if (err == MP_OKAY) { - i = bits / 21; - c = bits % 21; - n = e[i--] << (21 - c); + i = bits / 29; + c = bits % 29; + n = e[i--] << (29 - c); for (; ; c--) { if (c == 0) { if (i == -1) { @@ -10898,28 +14553,28 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e } n = e[i--]; - c = 21; + c = 29; } - y = (int)((n >> 20) & 1); + y = (int)((n >> 28) & 1); n <<= 1; - sp_4096_mont_mul_196(t[y^1], t[0], t[1], m, mp); + sp_4096_mont_mul_142(t[y^1], t[0], t[1], m, mp); XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), - sizeof(*t[2]) * 196 * 2); - sp_4096_mont_sqr_196(t[2], t[2], m, mp); + sizeof(*t[2]) * 142 * 2); + sp_4096_mont_sqr_142(t[2], t[2], m, mp); XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + ((size_t)t[1] & addr_mask[y])), t[2], - sizeof(*t[2]) * 196 * 2); + sizeof(*t[2]) * 142 * 2); } - sp_4096_mont_reduce_196(t[0], m, mp); - n = sp_4096_cmp_196(t[0], m); - sp_4096_cond_sub_196(t[0], t[0], m, ((n < 0) ? + sp_4096_mont_reduce_142(t[0], m, mp); + n = sp_4096_cmp_142(t[0], m); + sp_4096_cond_sub_142(t[0], t[0], m, ((n < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, t[0], sizeof(*r) * 196 * 2); + XMEMCPY(r, t[0], sizeof(*r) * 142 * 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -10932,9 +14587,9 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 392) + 392]; + sp_digit td[(16 * 284) + 284]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm = NULL; sp_digit mp = 1; @@ -10945,7 +14600,7 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 392) + 392), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 284) + 284), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -10953,102 +14608,97 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e if (err == MP_OKAY) { norm = td; - for (i=0; i<32; i++) - t[i] = td + i * 392; - rt = td + 12544; + for (i=0; i<16; i++) + t[i] = td + i * 284; + rt = td + 4544; sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); if (reduceA != 0) { - err = sp_4096_mod_196(t[1], a, m); + err = sp_4096_mod_142(t[1], a, m); if (err == MP_OKAY) { - sp_4096_mul_196(t[1], t[1], norm); - err = sp_4096_mod_196(t[1], t[1], m); + sp_4096_mul_142(t[1], t[1], norm); + err = sp_4096_mod_142(t[1], t[1], m); } } else { - sp_4096_mul_196(t[1], a, norm); - err = sp_4096_mod_196(t[1], t[1], m); + sp_4096_mul_142(t[1], a, norm); + err = sp_4096_mod_142(t[1], t[1], m); } } if (err == MP_OKAY) { - sp_4096_mont_sqr_196(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_196(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_196(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_196(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_196(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_196(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_196(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_196(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_196(t[10], t[ 5], m, mp); - sp_4096_mont_mul_196(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_196(t[12], t[ 6], m, mp); - sp_4096_mont_mul_196(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_196(t[14], t[ 7], m, mp); - sp_4096_mont_mul_196(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_196(t[16], t[ 8], m, mp); - sp_4096_mont_mul_196(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_196(t[18], t[ 9], m, mp); - sp_4096_mont_mul_196(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_196(t[20], t[10], m, mp); - sp_4096_mont_mul_196(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_196(t[22], t[11], m, mp); - sp_4096_mont_mul_196(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_196(t[24], t[12], m, mp); - sp_4096_mont_mul_196(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_196(t[26], t[13], m, mp); - sp_4096_mont_mul_196(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_196(t[28], t[14], m, mp); - sp_4096_mont_mul_196(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_196(t[30], t[15], m, mp); - sp_4096_mont_mul_196(t[31], t[16], t[15], m, mp); + sp_4096_mont_sqr_142(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_142(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_142(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_142(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_142(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_142(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_142(t[ 8], t[ 4], m, 
mp); + sp_4096_mont_mul_142(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_142(t[10], t[ 5], m, mp); + sp_4096_mont_mul_142(t[11], t[ 6], t[ 5], m, mp); + sp_4096_mont_sqr_142(t[12], t[ 6], m, mp); + sp_4096_mont_mul_142(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_142(t[14], t[ 7], m, mp); + sp_4096_mont_mul_142(t[15], t[ 8], t[ 7], m, mp); - bits = ((bits + 4) / 5) * 5; - i = ((bits + 20) / 21) - 1; - c = bits % 21; + bits = ((bits + 3) / 4) * 4; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 21; + c = 29; } - if (i < 196) { + if (i < 142) { n = e[i--] << (32 - c); } else { n = 0; i--; } - if (c < 5) { - n |= e[i--] << (11 - c); - c += 21; + if (c < 4) { + n |= e[i--] << (3 - c); + c += 29; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - XMEMCPY(rt, t[y], sizeof(sp_digit) * 392); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (11 - c); - c += 21; + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 284); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - sp_4096_mont_sqr_196(rt, rt, m, mp); - sp_4096_mont_sqr_196(rt, rt, m, mp); - sp_4096_mont_sqr_196(rt, rt, m, mp); - sp_4096_mont_sqr_196(rt, rt, m, mp); - sp_4096_mont_sqr_196(rt, rt, m, mp); + sp_4096_mont_sqr_142(rt, rt, m, mp); + sp_4096_mont_sqr_142(rt, rt, m, mp); + sp_4096_mont_sqr_142(rt, rt, m, mp); + sp_4096_mont_sqr_142(rt, rt, m, mp); - sp_4096_mont_mul_196(rt, rt, t[y], m, mp); + sp_4096_mont_mul_142(rt, rt, t[y], m, mp); } - sp_4096_mont_reduce_196(rt, m, mp); - n = sp_4096_cmp_196(rt, m); - sp_4096_cond_sub_196(rt, rt, m, ((n < 0) ? + sp_4096_mont_reduce_142(rt, m, mp); + n = sp_4096_cmp_142(rt, m); + sp_4096_cond_sub_142(rt, rt, m, ((n < 0) ? (sp_digit)1 : (sp_digit)0) - 1); - XMEMCPY(r, rt, sizeof(sp_digit) * 392); + XMEMCPY(r, rt, sizeof(sp_digit) * 284); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -11059,8 +14709,6 @@ static int sp_4096_mod_exp_196(sp_digit* r, const sp_digit* a, const sp_digit* e return err; #endif } -#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ - /* WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. 
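The fast-modexp path above now precomputes a 16-entry table (4-bit windows) for the 142-word case instead of the 32-entry/5-bit table used by the 71-word path, and walks the exponent in 4-bit slices that may straddle 29-bit word boundaries. As a rough illustration of the same fixed-window pattern on a plain 64-bit exponent (a sketch only — mul_mod() and exp_mod_win4() are hypothetical helpers, not wolfSSL functions; the real routines operate on 142-word Montgomery-form values via sp_4096_mont_sqr_142/sp_4096_mont_mul_142):

/* Sketch only: 4-bit fixed-window modular exponentiation on a 64-bit
 * exponent. 32-bit operands keep every product inside uint64_t. */
#include <stdint.h>

static uint32_t mul_mod(uint32_t a, uint32_t b, uint32_t m)
{
    return (uint32_t)(((uint64_t)a * b) % m);
}

static uint32_t exp_mod_win4(uint32_t a, uint64_t e, uint32_t m)
{
    uint32_t t[16];   /* t[i] = a^i mod m, analogous to t[0..15] above */
    uint32_t r;
    int i;
    int j;

    t[0] = 1 % m;
    for (i = 1; i < 16; i++) {
        t[i] = mul_mod(t[i - 1], a, m);
    }

    r = 1 % m;
    for (i = 60; i >= 0; i -= 4) {             /* top 4-bit window first */
        for (j = 0; j < 4; j++) {
            r = mul_mod(r, r, m);              /* four squarings per window */
        }
        r = mul_mod(r, t[(e >> i) & 0xf], m);  /* one table multiply */
    }
    return r;
}

The c >= 4 / c == 0 / else branches added in the patched loop exist only to reassemble a 4-bit window when it spans two 29-bit exponent words; the arithmetic is the same square-four-times-then-multiply shown in the sketch.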
@@ -11082,7 +14730,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[196 * 5]; + sp_digit a[142 * 5]; #endif sp_digit* m = NULL; sp_digit* r = NULL; @@ -11097,7 +14745,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - if (mp_count_bits(em) > 21) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 512U) { @@ -11113,7 +14761,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 5, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 5, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -11121,12 +14769,12 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #endif if (err == MP_OKAY) { - r = a + 196 * 2; - m = r + 196 * 2; + r = a + 142 * 2; + m = r + 142 * 2; norm = r; - sp_4096_from_bin(a, 196, in, inLen); -#if DIGIT_BIT >= 21 + sp_4096_from_bin(a, 142, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -11140,36 +14788,36 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_from_mp(m, 196, mm); + sp_4096_from_mp(m, 142, mm); sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); } if (err == MP_OKAY) { - sp_4096_mul_196(a, a, norm); - err = sp_4096_mod_196(a, a, m); + sp_4096_mul_142(a, a, norm); + err = sp_4096_mod_142(a, a, m); } if (err == MP_OKAY) { - for (i=20; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 196 * 2); + XMEMCPY(r, a, sizeof(sp_digit) * 142 * 2); for (i--; i>=0; i--) { - sp_4096_mont_sqr_196(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_4096_mont_mul_196(r, r, a, m, mp); + sp_4096_mont_mul_142(r, r, a, m, mp); } } - sp_4096_mont_reduce_196(r, m, mp); - mp = sp_4096_cmp_196(r, m); - sp_4096_cond_sub_196(r, r, m, ((mp < 0) ? + sp_4096_mont_reduce_142(r, m, mp); + mp = sp_4096_cmp_142(r, m); + sp_4096_cond_sub_142(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0)- 1); - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11183,7 +14831,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[196 * 5]; + sp_digit d[142 * 5]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -11195,7 +14843,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, err = MP_TO_E; } if (err == MP_OKAY) { - if (mp_count_bits(em) > 21) { + if (mp_count_bits(em) > 29) { err = MP_READ_E; } else if (inLen > 512U) { @@ -11211,7 +14859,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 5, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 5, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -11220,11 +14868,11 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, if (err == MP_OKAY) { a = d; - r = a + 196 * 2; - m = r + 196 * 2; + r = a + 142 * 2; + m = r + 142 * 2; - sp_4096_from_bin(a, 196, in, inLen); -#if DIGIT_BIT >= 21 + sp_4096_from_bin(a, 142, in, inLen); +#if DIGIT_BIT >= 29 e[0] = (sp_digit)em->dp[0]; #else e[0] = (sp_digit)em->dp[0]; @@ -11237,14 +14885,14 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } } if (err == MP_OKAY) { - sp_4096_from_mp(m, 196, mm); + sp_4096_from_mp(m, 142, mm); if (e[0] == 0x3) { - sp_4096_sqr_196(r, a); - err = sp_4096_mod_196(r, r, m); + sp_4096_sqr_142(r, a); + err = sp_4096_mod_142(r, r, m); if (err == MP_OKAY) { - sp_4096_mul_196(r, a, r); - err = sp_4096_mod_196(r, r, m); + sp_4096_mul_142(r, a, r); + err = sp_4096_mod_142(r, r, m); } } else { @@ -11253,36 +14901,36 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_digit mp; sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); - sp_4096_mul_196(a, a, norm); - err = sp_4096_mod_196(a, a, m); + sp_4096_mul_142(a, a, norm); + err = sp_4096_mod_142(a, a, m); if (err == MP_OKAY) { - for (i=20; i>=0; i--) { + for (i=28; i>=0; i--) { if ((e[0] >> i) != 0) { break; } } - XMEMCPY(r, a, sizeof(sp_digit) * 392U); + XMEMCPY(r, a, sizeof(sp_digit) * 284U); for (i--; i>=0; i--) { - sp_4096_mont_sqr_196(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); if (((e[0] >> i) & 1) == 1) { - sp_4096_mont_mul_196(r, r, a, m, mp); + sp_4096_mont_mul_142(r, r, a, m, mp); } } - sp_4096_mont_reduce_196(r, m, mp); - mp = sp_4096_cmp_196(r, m); - sp_4096_cond_sub_196(r, r, m, ((mp < 0) ? + sp_4096_mont_reduce_142(r, m, mp); + mp = sp_4096_cmp_142(r, m); + sp_4096_cond_sub_142(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0) - 1); } } } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11324,7 +14972,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[196 * 4]; + sp_digit d[142 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -11357,7 +15005,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -11365,18 +15013,18 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 196; - m = a + 392; + a = d + 142; + m = a + 284; r = a; - sp_4096_from_bin(a, 196, in, inLen); - sp_4096_from_mp(d, 196, dm); - sp_4096_from_mp(m, 196, mm); - err = sp_4096_mod_exp_196(r, a, d, 4096, m, 0); + sp_4096_from_bin(a, 142, in, inLen); + sp_4096_from_mp(d, 142, dm); + sp_4096_from_mp(m, 142, mm); + err = sp_4096_mod_exp_142(r, a, d, 4096, m, 0); } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11386,7 +15034,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 196); + ForceZero(a, sizeof(sp_digit) * 142); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -11397,7 +15045,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* d = NULL; #else - sp_digit d[196 * 4]; + sp_digit d[142 * 4]; #endif sp_digit* a = NULL; sp_digit* m = NULL; @@ -11430,7 +15078,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL, DYNAMIC_TYPE_RSA); if (d == NULL) err = MEMORY_E; @@ -11438,18 +15086,18 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - a = d + 196; - m = a + 392; + a = d + 142; + m = a + 284; r = a; - sp_4096_from_bin(a, 196, in, inLen); - sp_4096_from_mp(d, 196, dm); - sp_4096_from_mp(m, 196, mm); - err = sp_4096_mod_exp_196(r, a, d, 4096, m, 0); + sp_4096_from_bin(a, 142, in, inLen); + sp_4096_from_mp(d, 142, dm); + sp_4096_from_mp(m, 142, mm); + err = sp_4096_mod_exp_142(r, a, d, 4096, m, 0); } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11459,7 +15107,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, { /* only "a" and "r" are sensitive and need zeroized (same pointer) */ if (a != NULL) - ForceZero(a, sizeof(sp_digit) * 196); + ForceZero(a, sizeof(sp_digit) * 142); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(d, NULL, DYNAMIC_TYPE_RSA); #endif @@ -11472,10 +15120,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[98 * 11]; + sp_digit a[71 * 
8]; #endif sp_digit* p = NULL; - sp_digit* q = NULL; sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -11504,47 +15151,48 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 98 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 71 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 196 * 2; - q = p + 98; - qi = dq = dp = q + 98; - tmpa = qi + 98; - tmpb = tmpa + 196; - r = a + 196; + p = a + 142; + qi = dq = dp = p + 71; + tmpa = qi + 71; + tmpb = tmpa + 142; + r = a; - sp_4096_from_bin(a, 196, in, inLen); - sp_4096_from_mp(p, 98, pm); - sp_4096_from_mp(q, 98, qm); - sp_4096_from_mp(dp, 98, dpm); - err = sp_4096_mod_exp_98(tmpa, a, dp, 2048, p, 1); + sp_4096_from_bin(a, 142, in, inLen); + sp_4096_from_mp(p, 71, pm); + sp_4096_from_mp(dp, 71, dpm); + err = sp_4096_mod_exp_71(tmpa, a, dp, 2048, p, 1); } if (err == MP_OKAY) { - sp_4096_from_mp(dq, 98, dqm); - err = sp_4096_mod_exp_98(tmpb, a, dq, 2048, q, 1); + sp_4096_from_mp(p, 71, qm); + sp_4096_from_mp(dq, 71, dqm); + err = sp_4096_mod_exp_71(tmpb, a, dq, 2048, p, 1); } if (err == MP_OKAY) { - (void)sp_4096_sub_98(tmpa, tmpa, tmpb); - sp_4096_norm_98(tmpa); - sp_4096_cond_add_98(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[97] >> 31)); - sp_4096_cond_add_98(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[97] >> 31)); + sp_4096_from_mp(p, 71, pm); + (void)sp_4096_sub_71(tmpa, tmpa, tmpb); + sp_4096_norm_71(tmpa); + sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31)); + sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31)); - sp_4096_from_mp(qi, 98, qim); - sp_4096_mul_98(tmpa, tmpa, qi); - err = sp_4096_mod_98(tmpa, tmpa, p); + sp_4096_from_mp(qi, 71, qim); + sp_4096_mul_71(tmpa, tmpa, qi); + err = sp_4096_mod_71(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_4096_mul_98(tmpa, q, tmpa); - (void)sp_4096_add_196(r, tmpb, tmpa); - sp_4096_norm_196(r); + sp_4096_from_mp(p, 71, qm); + sp_4096_mul_71(tmpa, p, tmpa); + (void)sp_4096_add_142(r, tmpb, tmpa); + sp_4096_norm_142(r); - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11552,7 +15200,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 98 * 11); + ForceZero(a, sizeof(sp_digit) * 71 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -11563,7 +15211,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[98 * 13]; + sp_digit a[71 * 13]; #endif sp_digit* p = NULL; sp_digit* q = NULL; @@ -11595,7 +15243,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 98 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 71 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -11603,43 +15251,43 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #endif if (err == MP_OKAY) { - p = a + 196 * 2; - q = p + 98; - dp = q + 98; - dq = dp + 98; - qi = dq + 98; - tmpa = qi + 98; - tmpb = tmpa + 196; + p = a + 142 * 2; + q = p + 71; + dp = q + 71; + dq = dp + 71; + qi = dq + 71; + tmpa 
= qi + 71; + tmpb = tmpa + 142; r = a; - sp_4096_from_bin(a, 196, in, inLen); - sp_4096_from_mp(p, 98, pm); - sp_4096_from_mp(q, 98, qm); - sp_4096_from_mp(dp, 98, dpm); - sp_4096_from_mp(dq, 98, dqm); - sp_4096_from_mp(qi, 98, qim); + sp_4096_from_bin(a, 142, in, inLen); + sp_4096_from_mp(p, 71, pm); + sp_4096_from_mp(q, 71, qm); + sp_4096_from_mp(dp, 71, dpm); + sp_4096_from_mp(dq, 71, dqm); + sp_4096_from_mp(qi, 71, qim); - err = sp_4096_mod_exp_98(tmpa, a, dp, 2048, p, 1); + err = sp_4096_mod_exp_71(tmpa, a, dp, 2048, p, 1); } if (err == MP_OKAY) { - err = sp_4096_mod_exp_98(tmpb, a, dq, 2048, q, 1); + err = sp_4096_mod_exp_71(tmpb, a, dq, 2048, q, 1); } if (err == MP_OKAY) { - (void)sp_4096_sub_98(tmpa, tmpa, tmpb); - sp_4096_norm_98(tmpa); - sp_4096_cond_add_98(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[97] >> 31)); - sp_4096_cond_add_98(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[97] >> 31)); - sp_4096_mul_98(tmpa, tmpa, qi); - err = sp_4096_mod_98(tmpa, tmpa, p); + (void)sp_4096_sub_71(tmpa, tmpa, tmpb); + sp_4096_norm_71(tmpa); + sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31)); + sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31)); + sp_4096_mul_71(tmpa, tmpa, qi); + err = sp_4096_mod_71(tmpa, tmpa, p); } if (err == MP_OKAY) { - sp_4096_mul_98(tmpa, tmpa, q); - (void)sp_4096_add_196(r, tmpb, tmpa); - sp_4096_norm_196(r); + sp_4096_mul_71(tmpa, tmpa, q); + (void)sp_4096_add_142(r, tmpb, tmpa); + sp_4096_norm_142(r); - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; } @@ -11647,7 +15295,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 98 * 13); + ForceZero(a, sizeof(sp_digit) * 71 * 13); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -11673,22 +15321,22 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r) err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT); if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ -#if DIGIT_BIT == 21 - XMEMCPY(r->dp, a, sizeof(sp_digit) * 196); - r->used = 196; +#if DIGIT_BIT == 29 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 142); + r->used = 142; mp_clamp(r); -#elif DIGIT_BIT < 21 +#elif DIGIT_BIT < 29 int i; int j = 0; int s = 0; r->dp[0] = 0; - for (i = 0; i < 196; i++) { + for (i = 0; i < 142; i++) { r->dp[j] |= (mp_digit)(a[i] << s); r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; s = DIGIT_BIT - s; r->dp[++j] = (mp_digit)(a[i] >> s); - while (s + DIGIT_BIT <= 21) { + while (s + DIGIT_BIT <= 29) { s += DIGIT_BIT; r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; if (s == SP_WORD_SIZE) { @@ -11698,7 +15346,7 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r) r->dp[j] = (mp_digit)(a[i] >> s); } } - s = 21 - s; + s = 29 - s; } r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; mp_clamp(r); @@ -11708,18 +15356,18 @@ static int sp_4096_to_mp(const sp_digit* a, mp_int* r) int s = 0; r->dp[0] = 0; - for (i = 0; i < 196; i++) { + for (i = 0; i < 142; i++) { r->dp[j] |= ((mp_digit)a[i]) << s; - if (s + 21 >= DIGIT_BIT) { + if (s + 29 >= DIGIT_BIT) { #if DIGIT_BIT != 32 && DIGIT_BIT != 64 r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; #endif s = DIGIT_BIT - s; r->dp[++j] = a[i] >> s; - s = 21 - s; + s = 29 - s; } else { - s += 21; + s += 29; } } r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; @@ -11747,7 +15395,7 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && 
!defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[196 * 4]; + sp_digit b[142 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -11769,7 +15417,7 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -11777,15 +15425,15 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif if (err == MP_OKAY) { - e = b + 196 * 2; - m = e + 196; + e = b + 142 * 2; + m = e + 142; r = b; - sp_4096_from_mp(b, 196, base); - sp_4096_from_mp(e, 196, exp); - sp_4096_from_mp(m, 196, mod); + sp_4096_from_mp(b, 142, base); + sp_4096_from_mp(e, 142, exp); + sp_4096_from_mp(m, 142, mod); - err = sp_4096_mod_exp_196(r, b, e, mp_count_bits(exp), m, 0); + err = sp_4096_mod_exp_142(r, b, e, mp_count_bits(exp), m, 0); } if (err == MP_OKAY) { @@ -11798,7 +15446,7 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 196U); + ForceZero(e, sizeof(sp_digit) * 142U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -11808,7 +15456,7 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[196 * 4]; + sp_digit b[142 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -11831,22 +15479,22 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, DYNAMIC_TYPE_DH); + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - e = b + 196 * 2; - m = e + 196; + e = b + 142 * 2; + m = e + 142; r = b; - sp_4096_from_mp(b, 196, base); - sp_4096_from_mp(e, 196, exp); - sp_4096_from_mp(m, 196, mod); + sp_4096_from_mp(b, 142, base); + sp_4096_from_mp(e, 142, exp); + sp_4096_from_mp(m, 142, mod); - err = sp_4096_mod_exp_196(r, b, e, expBits, m, 0); + err = sp_4096_mod_exp_142(r, b, e, expBits, m, 0); } if (err == MP_OKAY) { @@ -11860,7 +15508,7 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 196U); + ForceZero(e, sizeof(sp_digit) * 142U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif @@ -11873,414 +15521,16 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_4096 -SP_NOINLINE static void sp_4096_lshift_196(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_lshift_142(sp_digit* r, const sp_digit* a, byte n) { -#ifdef WOLFSSL_SP_SMALL int i; - r[196] = a[195] >> (21 - n); - for (i=195; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (21 - n))) & 0x1fffff; + r[142] = a[141] >> (29 - n); + for (i=141; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (29 - n))) & 0x1fffffff; } -#else - sp_int_digit s; - sp_int_digit t; - - s = (sp_int_digit)a[195]; - r[196] = s >> (21U - n); - s = 
(sp_int_digit)(a[195]); t = (sp_int_digit)(a[194]); - r[195] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[194]); t = (sp_int_digit)(a[193]); - r[194] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[193]); t = (sp_int_digit)(a[192]); - r[193] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[192]); t = (sp_int_digit)(a[191]); - r[192] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[191]); t = (sp_int_digit)(a[190]); - r[191] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[190]); t = (sp_int_digit)(a[189]); - r[190] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[189]); t = (sp_int_digit)(a[188]); - r[189] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[188]); t = (sp_int_digit)(a[187]); - r[188] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[187]); t = (sp_int_digit)(a[186]); - r[187] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[186]); t = (sp_int_digit)(a[185]); - r[186] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[185]); t = (sp_int_digit)(a[184]); - r[185] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[184]); t = (sp_int_digit)(a[183]); - r[184] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[183]); t = (sp_int_digit)(a[182]); - r[183] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[182]); t = (sp_int_digit)(a[181]); - r[182] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[181]); t = (sp_int_digit)(a[180]); - r[181] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[180]); t = (sp_int_digit)(a[179]); - r[180] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[179]); t = (sp_int_digit)(a[178]); - r[179] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[178]); t = (sp_int_digit)(a[177]); - r[178] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[177]); t = (sp_int_digit)(a[176]); - r[177] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[176]); t = (sp_int_digit)(a[175]); - r[176] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[175]); t = (sp_int_digit)(a[174]); - r[175] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[174]); t = (sp_int_digit)(a[173]); - r[174] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[173]); t = (sp_int_digit)(a[172]); - r[173] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[172]); t = (sp_int_digit)(a[171]); - r[172] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[171]); t = (sp_int_digit)(a[170]); - r[171] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[170]); t = (sp_int_digit)(a[169]); - r[170] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[169]); t = (sp_int_digit)(a[168]); - r[169] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[168]); t = (sp_int_digit)(a[167]); - r[168] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[167]); t = (sp_int_digit)(a[166]); - r[167] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[166]); t = (sp_int_digit)(a[165]); - r[166] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[165]); t = (sp_int_digit)(a[164]); - r[165] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[164]); t = (sp_int_digit)(a[163]); - r[164] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = 
(sp_int_digit)(a[163]); t = (sp_int_digit)(a[162]); - r[163] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[162]); t = (sp_int_digit)(a[161]); - r[162] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[161]); t = (sp_int_digit)(a[160]); - r[161] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[160]); t = (sp_int_digit)(a[159]); - r[160] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[159]); t = (sp_int_digit)(a[158]); - r[159] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[158]); t = (sp_int_digit)(a[157]); - r[158] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[157]); t = (sp_int_digit)(a[156]); - r[157] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[156]); t = (sp_int_digit)(a[155]); - r[156] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[155]); t = (sp_int_digit)(a[154]); - r[155] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[154]); t = (sp_int_digit)(a[153]); - r[154] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[153]); t = (sp_int_digit)(a[152]); - r[153] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[152]); t = (sp_int_digit)(a[151]); - r[152] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[151]); t = (sp_int_digit)(a[150]); - r[151] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[150]); t = (sp_int_digit)(a[149]); - r[150] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[149]); t = (sp_int_digit)(a[148]); - r[149] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[148]); t = (sp_int_digit)(a[147]); - r[148] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[147]); t = (sp_int_digit)(a[146]); - r[147] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[146]); t = (sp_int_digit)(a[145]); - r[146] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[145]); t = (sp_int_digit)(a[144]); - r[145] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[144]); t = (sp_int_digit)(a[143]); - r[144] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[143]); t = (sp_int_digit)(a[142]); - r[143] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[142]); t = (sp_int_digit)(a[141]); - r[142] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[141]); t = (sp_int_digit)(a[140]); - r[141] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[140]); t = (sp_int_digit)(a[139]); - r[140] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[139]); t = (sp_int_digit)(a[138]); - r[139] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[138]); t = (sp_int_digit)(a[137]); - r[138] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[137]); t = (sp_int_digit)(a[136]); - r[137] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[136]); t = (sp_int_digit)(a[135]); - r[136] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[135]); t = (sp_int_digit)(a[134]); - r[135] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[134]); t = (sp_int_digit)(a[133]); - r[134] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[133]); t = (sp_int_digit)(a[132]); - r[133] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[132]); t = (sp_int_digit)(a[131]); - r[132] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = 
(sp_int_digit)(a[131]); t = (sp_int_digit)(a[130]); - r[131] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[130]); t = (sp_int_digit)(a[129]); - r[130] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[129]); t = (sp_int_digit)(a[128]); - r[129] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[128]); t = (sp_int_digit)(a[127]); - r[128] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[127]); t = (sp_int_digit)(a[126]); - r[127] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[126]); t = (sp_int_digit)(a[125]); - r[126] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[125]); t = (sp_int_digit)(a[124]); - r[125] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[124]); t = (sp_int_digit)(a[123]); - r[124] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[123]); t = (sp_int_digit)(a[122]); - r[123] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[122]); t = (sp_int_digit)(a[121]); - r[122] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[121]); t = (sp_int_digit)(a[120]); - r[121] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[120]); t = (sp_int_digit)(a[119]); - r[120] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[119]); t = (sp_int_digit)(a[118]); - r[119] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[118]); t = (sp_int_digit)(a[117]); - r[118] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[117]); t = (sp_int_digit)(a[116]); - r[117] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[116]); t = (sp_int_digit)(a[115]); - r[116] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[115]); t = (sp_int_digit)(a[114]); - r[115] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[114]); t = (sp_int_digit)(a[113]); - r[114] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[113]); t = (sp_int_digit)(a[112]); - r[113] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[112]); t = (sp_int_digit)(a[111]); - r[112] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[111]); t = (sp_int_digit)(a[110]); - r[111] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[110]); t = (sp_int_digit)(a[109]); - r[110] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[109]); t = (sp_int_digit)(a[108]); - r[109] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[108]); t = (sp_int_digit)(a[107]); - r[108] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[107]); t = (sp_int_digit)(a[106]); - r[107] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[106]); t = (sp_int_digit)(a[105]); - r[106] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[105]); t = (sp_int_digit)(a[104]); - r[105] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[104]); t = (sp_int_digit)(a[103]); - r[104] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[103]); t = (sp_int_digit)(a[102]); - r[103] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[102]); t = (sp_int_digit)(a[101]); - r[102] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[101]); t = (sp_int_digit)(a[100]); - r[101] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[100]); t = (sp_int_digit)(a[99]); - r[100] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = 
(sp_int_digit)(a[99]); t = (sp_int_digit)(a[98]); - r[99] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[98]); t = (sp_int_digit)(a[97]); - r[98] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[97]); t = (sp_int_digit)(a[96]); - r[97] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[96]); t = (sp_int_digit)(a[95]); - r[96] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[95]); t = (sp_int_digit)(a[94]); - r[95] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[94]); t = (sp_int_digit)(a[93]); - r[94] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[93]); t = (sp_int_digit)(a[92]); - r[93] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[92]); t = (sp_int_digit)(a[91]); - r[92] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[91]); t = (sp_int_digit)(a[90]); - r[91] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[90]); t = (sp_int_digit)(a[89]); - r[90] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[89]); t = (sp_int_digit)(a[88]); - r[89] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[88]); t = (sp_int_digit)(a[87]); - r[88] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[87]); t = (sp_int_digit)(a[86]); - r[87] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[86]); t = (sp_int_digit)(a[85]); - r[86] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[85]); t = (sp_int_digit)(a[84]); - r[85] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[84]); t = (sp_int_digit)(a[83]); - r[84] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[83]); t = (sp_int_digit)(a[82]); - r[83] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[82]); t = (sp_int_digit)(a[81]); - r[82] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[81]); t = (sp_int_digit)(a[80]); - r[81] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[80]); t = (sp_int_digit)(a[79]); - r[80] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[79]); t = (sp_int_digit)(a[78]); - r[79] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[78]); t = (sp_int_digit)(a[77]); - r[78] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[77]); t = (sp_int_digit)(a[76]); - r[77] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[76]); t = (sp_int_digit)(a[75]); - r[76] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[75]); t = (sp_int_digit)(a[74]); - r[75] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[74]); t = (sp_int_digit)(a[73]); - r[74] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[73]); t = (sp_int_digit)(a[72]); - r[73] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[72]); t = (sp_int_digit)(a[71]); - r[72] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]); - r[71] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]); - r[70] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]); - r[69] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]); - r[68] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]); - r[67] = ((s << n) | (t >> (21U - n))) & 
0x1fffff; - s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]); - r[66] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]); - r[65] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]); - r[64] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]); - r[63] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]); - r[62] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]); - r[61] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]); - r[60] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]); - r[59] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]); - r[58] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]); - r[57] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]); - r[56] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]); - r[55] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]); - r[54] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]); - r[53] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]); - r[52] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]); - r[51] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]); - r[50] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]); - r[49] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]); - r[48] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]); - r[47] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]); - r[46] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]); - r[45] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]); - r[44] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]); - r[43] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]); - r[42] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]); - r[41] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]); - r[40] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]); - r[39] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[38]); t = (sp_int_digit)(a[37]); - r[38] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]); - r[37] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]); - r[36] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[35]); t = (sp_int_digit)(a[34]); - r[35] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]); - r[34] = ((s << n) | (t >> (21U 
- n))) & 0x1fffff; - s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]); - r[33] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]); - r[32] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]); - r[31] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]); - r[30] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); - r[29] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]); - r[28] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); - r[27] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); - r[26] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); - r[25] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); - r[24] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); - r[23] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); - r[22] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); - r[21] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); - r[20] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); - r[19] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); - r[18] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); - r[17] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); - r[16] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); - r[15] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); - r[14] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); - r[13] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); - r[12] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); - r[11] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); - r[10] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); - r[9] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); - r[8] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); - r[7] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); - r[6] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); - r[5] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); - r[4] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); - r[3] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); - r[2] = ((s << n) | (t >> (21U - n))) & 0x1fffff; - s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); - r[1] = ((s << n) | (t >> (21U - n))) & 0x1fffff; 
-#endif - r[0] = (a[0] << n) & 0x1fffff; + r[0] = (a[0] << n) & 0x1fffffff; } /* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) @@ -12291,12 +15541,12 @@ SP_NOINLINE static void sp_4096_lshift_196(sp_digit* r, const sp_digit* a, * m A single precision number that is the modulus. * returns 0 on success and MEMORY_E on dynamic memory allocation failure. */ -static int sp_4096_mod_exp_2_196(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +static int sp_4096_mod_exp_2_142(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[589]; + sp_digit td[427]; #endif sp_digit* norm = NULL; sp_digit* tmp = NULL; @@ -12309,7 +15559,7 @@ static int sp_4096_mod_exp_2_196(sp_digit* r, const sp_digit* e, int bits, const int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 589, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 427, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -12317,19 +15567,19 @@ static int sp_4096_mod_exp_2_196(sp_digit* r, const sp_digit* e, int bits, const if (err == MP_OKAY) { norm = td; - tmp = td + 392; - XMEMSET(td, 0, sizeof(sp_digit) * 589); + tmp = td + 284; + XMEMSET(td, 0, sizeof(sp_digit) * 427); sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_196(norm, m); + sp_4096_mont_norm_142(norm, m); bits = ((bits + 3) / 4) * 4; - i = ((bits + 20) / 21) - 1; - c = bits % 21; + i = ((bits + 28) / 29) - 1; + c = bits % 29; if (c == 0) { - c = 21; + c = 29; } - if (i < 196) { + if (i < 142) { n = e[i--] << (32 - c); } else { @@ -12337,41 +15587,53 @@ static int sp_4096_mod_exp_2_196(sp_digit* r, const sp_digit* e, int bits, const i--; } if (c < 4) { - n |= e[i--] << (11 - c); - c += 21; + n |= e[i--] << (3 - c); + c += 29; } y = (int)((n >> 28) & 0xf); n <<= 4; c -= 4; - sp_4096_lshift_196(r, norm, (byte)y); + sp_4096_lshift_142(r, norm, (byte)y); while ((i >= 0) || (c >= 4)) { - if (c < 4) { - n |= e[i--] << (11 - c); - c += 21; + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 25; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 29 - c; } - y = (int)((n >> 28) & 0xf); - n <<= 4; - c -= 4; - sp_4096_mont_sqr_196(r, r, m, mp); - sp_4096_mont_sqr_196(r, r, m, mp); - sp_4096_mont_sqr_196(r, r, m, mp); - sp_4096_mont_sqr_196(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); + sp_4096_mont_sqr_142(r, r, m, mp); - sp_4096_lshift_196(r, r, (byte)y); - sp_4096_mul_d_196(tmp, norm, (r[196] << 20) + (r[195] >> 1)); - r[196] = 0; - r[195] &= 0x1L; - (void)sp_4096_add_196(r, r, tmp); - sp_4096_norm_196(r); - o = sp_4096_cmp_196(r, m); - sp_4096_cond_sub_196(r, r, m, ((o < 0) ? + sp_4096_lshift_142(r, r, (byte)y); + sp_4096_mul_d_142(tmp, norm, (r[142] << 22) + (r[141] >> 7)); + r[142] = 0; + r[141] &= 0x7fL; + (void)sp_4096_add_142(r, r, tmp); + sp_4096_norm_142(r); + o = sp_4096_cmp_142(r, m); + sp_4096_cond_sub_142(r, r, m, ((o < 0) ? (sp_digit)1 : (sp_digit)0) - 1); } - sp_4096_mont_reduce_196(r, m, mp); - n = sp_4096_cmp_196(r, m); - sp_4096_cond_sub_196(r, r, m, ((n < 0) ? + sp_4096_mont_reduce_142(r, m, mp); + n = sp_4096_cmp_142(r, m); + sp_4096_cond_sub_142(r, r, m, ((n < 0) ? 
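
[Editor's note, not part of the patch] sp_4096_mod_exp_2_142() raises 2 to the exponent by shifting the Montgomery normalizer left four bits per iteration and folding whatever spills past bit 4096 back in through sp_4096_mul_d_142(). With 29-bit limbs a 4096-bit value uses words 0..141 and only the low 7 bits of word 141 (141*29 = 4089, 4089 + 7 = 4096), so the spill is exactly (r[142] << 22) + (r[141] >> 7). The helper below is hypothetical and only restates that arithmetic in isolation.

#include <stdint.h>

/* Illustrative sketch only: extract and clear the bits above bit 4096 of a
 * 29-bit-limb value after a left shift of at most 4 bits.  r[142] then holds
 * at most 4 bits, so the uint32_t result cannot overflow. */
static uint32_t fold_top_bits_29(uint32_t r[143])
{
    uint32_t over = (r[142] << 22) + (r[141] >> 7);  /* spill past bit 4096 */

    r[142] = 0;       /* drop the spill from the value itself...            */
    r[141] &= 0x7f;   /* ...keeping only bits 4089..4095 in place           */
    return over;      /* the caller multiplies this by norm and adds it back */
}
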
(sp_digit)1 : (sp_digit)0) - 1); } @@ -12400,84 +15662,10 @@ static int sp_4096_mod_exp_2_196(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else - sp_digit b[196 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 4096) { - err = MP_READ_E; - } - else if (expLen > 512) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 4096) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 196 * 2; - m = e + 196; - r = b; - - sp_4096_from_mp(b, 196, base); - sp_4096_from_bin(e, 196, exp, expLen); - sp_4096_from_mp(m, 196, mod); - - #ifdef HAVE_FFDHE_4096 - if (base->used == 1 && base->dp[0] == 2 && - ((m[195] << 15) | (m[194] >> 6)) == 0xffffL) { - err = sp_4096_mod_exp_2_196(r, e, expLen * 8, m); - } - else - #endif - err = sp_4096_mod_exp_196(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_4096_to_bin(r, out); - *outLen = 512; - for (i=0; i<512 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 196U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[196 * 4]; + sp_digit b[142 * 4]; #endif sp_digit* e = NULL; sp_digit* m = NULL; @@ -12500,7 +15688,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 196 * 4, NULL, + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL, DYNAMIC_TYPE_DH); if (b == NULL) err = MEMORY_E; @@ -12508,29 +15696,29 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, #endif if (err == MP_OKAY) { - e = b + 196 * 2; - m = e + 196; + e = b + 142 * 2; + m = e + 142; r = b; - sp_4096_from_mp(b, 196, base); - sp_4096_from_bin(e, 196, exp, expLen); - sp_4096_from_mp(m, 196, mod); + sp_4096_from_mp(b, 142, base); + sp_4096_from_bin(e, 142, exp, expLen); + sp_4096_from_mp(m, 142, mod); #ifdef HAVE_FFDHE_4096 if (base->used == 1 && base->dp[0] == 2U && - ((m[195] << 15) | (m[194] >> 6)) == 0xffffL) { - err = sp_4096_mod_exp_2_196(r, e, expLen * 8U, m); + ((m[141] << 9) | (m[140] >> 20)) == 0xffffL) { + err = sp_4096_mod_exp_2_142(r, e, expLen * 8U, m); } else { #endif - err = sp_4096_mod_exp_196(r, b, e, expLen * 8U, m, 0); + err = sp_4096_mod_exp_142(r, b, e, expLen * 8U, m, 0); #ifdef HAVE_FFDHE_4096 } #endif } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_142(r, out); *outLen = 512; for (i=0; i<512U && out[i] == 0U; i++) { /* Search for first non-zero. 
*/ @@ -12545,497 +15733,61 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, { /* only "e" is sensitive and needs zeroized */ if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 196U); + ForceZero(e, sizeof(sp_digit) * 142U); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(b, NULL, DYNAMIC_TYPE_DH); #endif } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ #endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ -#endif /* WOLFSSL_SP_4096 */ - -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ -#ifdef WOLFSSL_HAVE_SP_ECC -#ifndef WOLFSSL_SP_NO_256 - -/* Point structure to use. */ -typedef struct sp_point_256 { - /* X ordinate of point. */ - sp_digit x[2 * 10]; - /* Y ordinate of point. */ - sp_digit y[2 * 10]; - /* Z ordinate of point. */ - sp_digit z[2 * 10]; - /* Indicates point is at infinity. */ - int infinity; -} sp_point_256; - -/* The modulus (prime) of the curve P256. */ -static const sp_digit p256_mod[10] = { - 0x3ffffff,0x3ffffff,0x3ffffff,0x003ffff,0x0000000,0x0000000,0x0000000, - 0x0000400,0x3ff0000,0x03fffff -}; -/* The Montogmery normalizer for modulus of the curve P256. */ -static const sp_digit p256_norm_mod[10] = { - 0x0000001,0x0000000,0x0000000,0x3fc0000,0x3ffffff,0x3ffffff,0x3ffffff, - 0x3fffbff,0x000ffff,0x0000000 -}; -/* The Montogmery multiplier for modulus of the curve P256. */ -static const sp_digit p256_mp_mod = 0x000001; -#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \ - defined(HAVE_ECC_VERIFY) -/* The order of the curve P256. */ -static const sp_digit p256_order[10] = { - 0x0632551,0x272b0bf,0x1e84f3b,0x2b69c5e,0x3bce6fa,0x3ffffff,0x3ffffff, - 0x00003ff,0x3ff0000,0x03fffff -}; -#endif -/* The order of the curve P256 minus 2. */ -static const sp_digit p256_order2[10] = { - 0x063254f,0x272b0bf,0x1e84f3b,0x2b69c5e,0x3bce6fa,0x3ffffff,0x3ffffff, - 0x00003ff,0x3ff0000,0x03fffff -}; -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -/* The Montogmery normalizer for order of the curve P256. */ -static const sp_digit p256_norm_order[10] = { - 0x39cdaaf,0x18d4f40,0x217b0c4,0x14963a1,0x0431905,0x0000000,0x0000000, - 0x3fffc00,0x000ffff,0x0000000 -}; -#endif -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -/* The Montogmery multiplier for order of the curve P256. */ -static const sp_digit p256_mp_order = 0x200bc4f; -#endif -/* The base point of curve P256. 
*/ -static const sp_point_256 p256_base = { - /* X ordinate */ - { - 0x098c296,0x04e5176,0x33a0f4a,0x204b7ac,0x277037d,0x0e9103c,0x3ce6e56, - 0x1091fe2,0x1f2e12c,0x01ac5f4, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 - }, - /* Y ordinate */ - { - 0x3bf51f5,0x1901a0d,0x1ececbb,0x15dacc5,0x22bce33,0x303e785,0x27eb4a7, - 0x1fe6e3b,0x2e2fe1a,0x013f8d0, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 - }, - /* Z ordinate */ - { - 0x0000001,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000, - 0x0000000,0x0000000,0x0000000, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, - (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 - }, - /* infinity */ - 0 -}; -#if defined(HAVE_ECC_CHECK_KEY) || defined(HAVE_COMP_KEY) -static const sp_digit p256_b[10] = { - 0x3d2604b,0x38f0f89,0x30f63bc,0x2c3314e,0x0651d06,0x1a621af,0x2bbd557, - 0x24f9ecf,0x1d8aa3a,0x016b18d -}; -#endif - -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) +#else +/* Read big endian unsigned byte array into r. * * r A single precision integer. - * a A single precision integer. - * b A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. */ -SP_NOINLINE static void sp_256_mul_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) { int i; - int j; - int k; - int64_t c; + int j = 0; + word32 s = 0; - c = ((int64_t)a[9]) * b[9]; - r[19] = (sp_digit)(c >> 26); - c = (c & 0x3ffffff) << 26; - for (k = 17; k >= 0; k--) { - for (i = 9; i >= 0; i--) { - j = k - i; - if (j >= 10) { + r[0] = 0; + for (i = n-1; i >= 0; i--) { + r[j] |= (((sp_digit)a[i]) << s); + if (s >= 18U) { + r[j] &= 0x3ffffff; + s = 26U - s; + if (j + 1 >= size) { break; } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + r[++j] = (sp_digit)a[i] >> s; + s = 8U - s; } - r[k + 2] += (sp_digit)(c >> 52); - r[k + 1] = (sp_digit)((c >> 26) & 0x3ffffff); - c = (c & 0x3ffffff) << 26; - } - r[0] = (sp_digit)(c >> 26); -} - -#else -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
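
[Editor's note, not part of the patch] Both sp_256_mul_10 variants being removed here compute the same schoolbook product over 26-bit limbs with a 64-bit accumulator: the looped version walks the columns from the most significant end, the unrolled version spells out every column sum t0..t18. The sketch below restates the idea low-to-high (the easier direction to check) for a generic limb count n; limb_mul, LIMB_BITS and LIMB_MASK are made-up names. For n = 10 each column sum stays below 2^56, so the 64-bit accumulator cannot overflow.

#include <stdint.h>

#define LIMB_BITS 26
#define LIMB_MASK 0x3ffffff

/* Illustrative sketch only: column-wise schoolbook multiplication of two
 * n-limb numbers (26 bits per limb) into a 2n-limb result. */
static void limb_mul(uint32_t* r, const uint32_t* a, const uint32_t* b, int n)
{
    uint64_t acc = 0;   /* column sum plus the carry from the column below */
    int k;
    int i;

    for (k = 0; k <= 2 * (n - 1); k++) {
        int lo = (k < n) ? 0 : (k - n + 1);
        int hi = (k < n) ? k : (n - 1);

        for (i = lo; i <= hi; i++) {
            acc += (uint64_t)a[i] * b[k - i];  /* every pair with i + j == k */
        }
        r[k] = (uint32_t)(acc & LIMB_MASK);    /* keep 26 bits per word      */
        acc >>= LIMB_BITS;                     /* carry the rest upwards     */
    }
    r[2 * n - 1] = (uint32_t)acc;              /* final carry word           */
}
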
- */ -SP_NOINLINE static void sp_256_mul_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int64_t t0 = ((int64_t)a[ 0]) * b[ 0]; - int64_t t1 = ((int64_t)a[ 0]) * b[ 1] - + ((int64_t)a[ 1]) * b[ 0]; - int64_t t2 = ((int64_t)a[ 0]) * b[ 2] - + ((int64_t)a[ 1]) * b[ 1] - + ((int64_t)a[ 2]) * b[ 0]; - int64_t t3 = ((int64_t)a[ 0]) * b[ 3] - + ((int64_t)a[ 1]) * b[ 2] - + ((int64_t)a[ 2]) * b[ 1] - + ((int64_t)a[ 3]) * b[ 0]; - int64_t t4 = ((int64_t)a[ 0]) * b[ 4] - + ((int64_t)a[ 1]) * b[ 3] - + ((int64_t)a[ 2]) * b[ 2] - + ((int64_t)a[ 3]) * b[ 1] - + ((int64_t)a[ 4]) * b[ 0]; - int64_t t5 = ((int64_t)a[ 0]) * b[ 5] - + ((int64_t)a[ 1]) * b[ 4] - + ((int64_t)a[ 2]) * b[ 3] - + ((int64_t)a[ 3]) * b[ 2] - + ((int64_t)a[ 4]) * b[ 1] - + ((int64_t)a[ 5]) * b[ 0]; - int64_t t6 = ((int64_t)a[ 0]) * b[ 6] - + ((int64_t)a[ 1]) * b[ 5] - + ((int64_t)a[ 2]) * b[ 4] - + ((int64_t)a[ 3]) * b[ 3] - + ((int64_t)a[ 4]) * b[ 2] - + ((int64_t)a[ 5]) * b[ 1] - + ((int64_t)a[ 6]) * b[ 0]; - int64_t t7 = ((int64_t)a[ 0]) * b[ 7] - + ((int64_t)a[ 1]) * b[ 6] - + ((int64_t)a[ 2]) * b[ 5] - + ((int64_t)a[ 3]) * b[ 4] - + ((int64_t)a[ 4]) * b[ 3] - + ((int64_t)a[ 5]) * b[ 2] - + ((int64_t)a[ 6]) * b[ 1] - + ((int64_t)a[ 7]) * b[ 0]; - int64_t t8 = ((int64_t)a[ 0]) * b[ 8] - + ((int64_t)a[ 1]) * b[ 7] - + ((int64_t)a[ 2]) * b[ 6] - + ((int64_t)a[ 3]) * b[ 5] - + ((int64_t)a[ 4]) * b[ 4] - + ((int64_t)a[ 5]) * b[ 3] - + ((int64_t)a[ 6]) * b[ 2] - + ((int64_t)a[ 7]) * b[ 1] - + ((int64_t)a[ 8]) * b[ 0]; - int64_t t9 = ((int64_t)a[ 0]) * b[ 9] - + ((int64_t)a[ 1]) * b[ 8] - + ((int64_t)a[ 2]) * b[ 7] - + ((int64_t)a[ 3]) * b[ 6] - + ((int64_t)a[ 4]) * b[ 5] - + ((int64_t)a[ 5]) * b[ 4] - + ((int64_t)a[ 6]) * b[ 3] - + ((int64_t)a[ 7]) * b[ 2] - + ((int64_t)a[ 8]) * b[ 1] - + ((int64_t)a[ 9]) * b[ 0]; - int64_t t10 = ((int64_t)a[ 1]) * b[ 9] - + ((int64_t)a[ 2]) * b[ 8] - + ((int64_t)a[ 3]) * b[ 7] - + ((int64_t)a[ 4]) * b[ 6] - + ((int64_t)a[ 5]) * b[ 5] - + ((int64_t)a[ 6]) * b[ 4] - + ((int64_t)a[ 7]) * b[ 3] - + ((int64_t)a[ 8]) * b[ 2] - + ((int64_t)a[ 9]) * b[ 1]; - int64_t t11 = ((int64_t)a[ 2]) * b[ 9] - + ((int64_t)a[ 3]) * b[ 8] - + ((int64_t)a[ 4]) * b[ 7] - + ((int64_t)a[ 5]) * b[ 6] - + ((int64_t)a[ 6]) * b[ 5] - + ((int64_t)a[ 7]) * b[ 4] - + ((int64_t)a[ 8]) * b[ 3] - + ((int64_t)a[ 9]) * b[ 2]; - int64_t t12 = ((int64_t)a[ 3]) * b[ 9] - + ((int64_t)a[ 4]) * b[ 8] - + ((int64_t)a[ 5]) * b[ 7] - + ((int64_t)a[ 6]) * b[ 6] - + ((int64_t)a[ 7]) * b[ 5] - + ((int64_t)a[ 8]) * b[ 4] - + ((int64_t)a[ 9]) * b[ 3]; - int64_t t13 = ((int64_t)a[ 4]) * b[ 9] - + ((int64_t)a[ 5]) * b[ 8] - + ((int64_t)a[ 6]) * b[ 7] - + ((int64_t)a[ 7]) * b[ 6] - + ((int64_t)a[ 8]) * b[ 5] - + ((int64_t)a[ 9]) * b[ 4]; - int64_t t14 = ((int64_t)a[ 5]) * b[ 9] - + ((int64_t)a[ 6]) * b[ 8] - + ((int64_t)a[ 7]) * b[ 7] - + ((int64_t)a[ 8]) * b[ 6] - + ((int64_t)a[ 9]) * b[ 5]; - int64_t t15 = ((int64_t)a[ 6]) * b[ 9] - + ((int64_t)a[ 7]) * b[ 8] - + ((int64_t)a[ 8]) * b[ 7] - + ((int64_t)a[ 9]) * b[ 6]; - int64_t t16 = ((int64_t)a[ 7]) * b[ 9] - + ((int64_t)a[ 8]) * b[ 8] - + ((int64_t)a[ 9]) * b[ 7]; - int64_t t17 = ((int64_t)a[ 8]) * b[ 9] - + ((int64_t)a[ 9]) * b[ 8]; - int64_t t18 = ((int64_t)a[ 9]) * b[ 9]; - - t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; - t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; - t3 += t2 >> 26; r[ 2] = t2 & 0x3ffffff; - t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; - t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; - t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; - t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; - t8 += t7 >> 26; r[ 
7] = t7 & 0x3ffffff; - t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; - t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; - t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; - t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; - t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; - t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; - t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; - t16 += t15 >> 26; r[15] = t15 & 0x3ffffff; - t17 += t16 >> 26; r[16] = t16 & 0x3ffffff; - t18 += t17 >> 26; r[17] = t17 & 0x3ffffff; - r[19] = (sp_digit)(t18 >> 26); - r[18] = t18 & 0x3ffffff; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_256_sqr_10(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int64_t c; - - c = ((int64_t)a[9]) * a[9]; - r[19] = (sp_digit)(c >> 26); - c = (c & 0x3ffffff) << 26; - for (k = 17; k >= 0; k--) { - for (i = 9; i >= 0; i--) { - j = k - i; - if (j >= 10 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + else { + s += 8U; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 52); - r[k + 1] = (sp_digit)((c >> 26) & 0x3ffffff); - c = (c & 0x3ffffff) << 26; - } - r[0] = (sp_digit)(c >> 26); -} - -#else -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_256_sqr_10(sp_digit* r, const sp_digit* a) -{ - int64_t t0 = ((int64_t)a[ 0]) * a[ 0]; - int64_t t1 = (((int64_t)a[ 0]) * a[ 1]) * 2; - int64_t t2 = (((int64_t)a[ 0]) * a[ 2]) * 2 - + ((int64_t)a[ 1]) * a[ 1]; - int64_t t3 = (((int64_t)a[ 0]) * a[ 3] - + ((int64_t)a[ 1]) * a[ 2]) * 2; - int64_t t4 = (((int64_t)a[ 0]) * a[ 4] - + ((int64_t)a[ 1]) * a[ 3]) * 2 - + ((int64_t)a[ 2]) * a[ 2]; - int64_t t5 = (((int64_t)a[ 0]) * a[ 5] - + ((int64_t)a[ 1]) * a[ 4] - + ((int64_t)a[ 2]) * a[ 3]) * 2; - int64_t t6 = (((int64_t)a[ 0]) * a[ 6] - + ((int64_t)a[ 1]) * a[ 5] - + ((int64_t)a[ 2]) * a[ 4]) * 2 - + ((int64_t)a[ 3]) * a[ 3]; - int64_t t7 = (((int64_t)a[ 0]) * a[ 7] - + ((int64_t)a[ 1]) * a[ 6] - + ((int64_t)a[ 2]) * a[ 5] - + ((int64_t)a[ 3]) * a[ 4]) * 2; - int64_t t8 = (((int64_t)a[ 0]) * a[ 8] - + ((int64_t)a[ 1]) * a[ 7] - + ((int64_t)a[ 2]) * a[ 6] - + ((int64_t)a[ 3]) * a[ 5]) * 2 - + ((int64_t)a[ 4]) * a[ 4]; - int64_t t9 = (((int64_t)a[ 0]) * a[ 9] - + ((int64_t)a[ 1]) * a[ 8] - + ((int64_t)a[ 2]) * a[ 7] - + ((int64_t)a[ 3]) * a[ 6] - + ((int64_t)a[ 4]) * a[ 5]) * 2; - int64_t t10 = (((int64_t)a[ 1]) * a[ 9] - + ((int64_t)a[ 2]) * a[ 8] - + ((int64_t)a[ 3]) * a[ 7] - + ((int64_t)a[ 4]) * a[ 6]) * 2 - + ((int64_t)a[ 5]) * a[ 5]; - int64_t t11 = (((int64_t)a[ 2]) * a[ 9] - + ((int64_t)a[ 3]) * a[ 8] - + ((int64_t)a[ 4]) * a[ 7] - + ((int64_t)a[ 5]) * a[ 6]) * 2; - int64_t t12 = (((int64_t)a[ 3]) * a[ 9] - + ((int64_t)a[ 4]) * a[ 8] - + ((int64_t)a[ 5]) * a[ 7]) * 2 - + ((int64_t)a[ 6]) * a[ 6]; - int64_t t13 = (((int64_t)a[ 4]) * a[ 9] - + ((int64_t)a[ 5]) * a[ 8] - + ((int64_t)a[ 6]) * a[ 7]) * 2; - int64_t t14 = (((int64_t)a[ 5]) * a[ 9] - + ((int64_t)a[ 6]) * a[ 8]) * 2 - + ((int64_t)a[ 7]) * a[ 7]; - int64_t t15 = (((int64_t)a[ 6]) * a[ 9] - + ((int64_t)a[ 7]) * a[ 8]) * 2; - int64_t t16 = (((int64_t)a[ 7]) * a[ 9]) * 2 - + ((int64_t)a[ 8]) * a[ 8]; - int64_t t17 = (((int64_t)a[ 8]) * a[ 9]) * 2; - int64_t t18 = ((int64_t)a[ 9]) * a[ 9]; - - t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; - t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; - t3 += 
t2 >> 26; r[ 2] = t2 & 0x3ffffff; - t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; - t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; - t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; - t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; - t8 += t7 >> 26; r[ 7] = t7 & 0x3ffffff; - t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; - t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; - t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; - t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; - t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; - t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; - t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; - t16 += t15 >> 26; r[15] = t15 & 0x3ffffff; - t17 += t16 >> 26; r[16] = t16 & 0x3ffffff; - t18 += t17 >> 26; r[17] = t17 & 0x3ffffff; - r[19] = (sp_digit)(t18 >> 26); - r[18] = t18 & 0x3ffffff; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_256_add_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 10; i++) { - r[i] = a[i] + b[i]; } - return 0; -} -#else -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_256_add_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - r[ 0] = a[ 0] + b[ 0]; - r[ 1] = a[ 1] + b[ 1]; - r[ 2] = a[ 2] + b[ 2]; - r[ 3] = a[ 3] + b[ 3]; - r[ 4] = a[ 4] + b[ 4]; - r[ 5] = a[ 5] + b[ 5]; - r[ 6] = a[ 6] + b[ 6]; - r[ 7] = a[ 7] + b[ 7]; - r[ 8] = a[ 8] + b[ 8]; - r[ 9] = a[ 9] + b[ 9]; - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_256_sub_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 10; i++) { - r[i] = a[i] - b[i]; + for (j++; j < size; j++) { + r[j] = 0; } - - return 0; } -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_256_sub_10(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - r[ 0] = a[ 0] - b[ 0]; - r[ 1] = a[ 1] - b[ 1]; - r[ 2] = a[ 2] - b[ 2]; - r[ 3] = a[ 3] - b[ 3]; - r[ 4] = a[ 4] - b[ 4]; - r[ 5] = a[ 5] - b[ 5]; - r[ 6] = a[ 6] - b[ 6]; - r[ 7] = a[ 7] - b[ 7]; - r[ 8] = a[ 8] - b[ 8]; - r[ 9] = a[ 9] - b[ 9]; - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ /* Convert an mp_int to an array of sp_digit. * * r A single precision integer. * size Maximum number of bytes to convert * a A multi-precision integer. */ -static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) +static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 26 int j; @@ -13114,4408 +15866,26 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -/* Convert a point of type ecc_point to type sp_point_256. - * - * p Point of type sp_point_256 (result). - * pm Point of type ecc_point. - */ -static void sp_256_point_from_ecc_point_10(sp_point_256* p, - const ecc_point* pm) -{ - XMEMSET(p->x, 0, sizeof(p->x)); - XMEMSET(p->y, 0, sizeof(p->y)); - XMEMSET(p->z, 0, sizeof(p->z)); - sp_256_from_mp(p->x, 10, pm->x); - sp_256_from_mp(p->y, 10, pm->y); - sp_256_from_mp(p->z, 10, pm->z); - p->infinity = 0; -} - -/* Convert an array of sp_digit to an mp_int. 
- * - * a A single precision integer. - * r A multi-precision integer. - */ -static int sp_256_to_mp(const sp_digit* a, mp_int* r) -{ - int err; - - err = mp_grow(r, (256 + DIGIT_BIT - 1) / DIGIT_BIT); - if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ -#if DIGIT_BIT == 26 - XMEMCPY(r->dp, a, sizeof(sp_digit) * 10); - r->used = 10; - mp_clamp(r); -#elif DIGIT_BIT < 26 - int i; - int j = 0; - int s = 0; - - r->dp[0] = 0; - for (i = 0; i < 10; i++) { - r->dp[j] |= (mp_digit)(a[i] << s); - r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; - s = DIGIT_BIT - s; - r->dp[++j] = (mp_digit)(a[i] >> s); - while (s + DIGIT_BIT <= 26) { - s += DIGIT_BIT; - r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; - if (s == SP_WORD_SIZE) { - r->dp[j] = 0; - } - else { - r->dp[j] = (mp_digit)(a[i] >> s); - } - } - s = 26 - s; - } - r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT; - mp_clamp(r); -#else - int i; - int j = 0; - int s = 0; - - r->dp[0] = 0; - for (i = 0; i < 10; i++) { - r->dp[j] |= ((mp_digit)a[i]) << s; - if (s + 26 >= DIGIT_BIT) { - #if DIGIT_BIT != 32 && DIGIT_BIT != 64 - r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; - #endif - s = DIGIT_BIT - s; - r->dp[++j] = a[i] >> s; - s = 26 - s; - } - else { - s += 26; - } - } - r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT; - mp_clamp(r); -#endif - } - - return err; -} - -/* Convert a point of type sp_point_256 to type ecc_point. - * - * p Point of type sp_point_256. - * pm Point of type ecc_point (result). - * returns MEMORY_E when allocation of memory in ecc_point fails otherwise - * MP_OKAY. - */ -static int sp_256_point_to_ecc_point_10(const sp_point_256* p, ecc_point* pm) -{ - int err; - - err = sp_256_to_mp(p->x, pm->x); - if (err == MP_OKAY) { - err = sp_256_to_mp(p->y, pm->y); - } - if (err == MP_OKAY) { - err = sp_256_to_mp(p->z, pm->z); - } - - return err; -} - -#define sp_256_mont_reduce_order_10 sp_256_mont_reduce_10 - -/* Compare a with b in constant time. - * - * a A single precision integer. - * b A single precision integer. - * return -ve, 0 or +ve if a is less than, equal to or greater than b - * respectively. - */ -static sp_digit sp_256_cmp_10(const sp_digit* a, const sp_digit* b) -{ - sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=9; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else - r |= (a[ 9] - b[ 9]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 8] - b[ 8]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 7] - b[ 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 6] - b[ 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 5] - b[ 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 4] - b[ 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 3] - b[ 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 2] - b[ 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 1] - b[ 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - r |= (a[ 0] - b[ 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); -#endif /* WOLFSSL_SP_SMALL */ - - return r; -} - -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. 
- */ -static void sp_256_cond_sub_10(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 10; i++) { - r[i] = a[i] - (b[i] & m); - } -#else - r[ 0] = a[ 0] - (b[ 0] & m); - r[ 1] = a[ 1] - (b[ 1] & m); - r[ 2] = a[ 2] - (b[ 2] & m); - r[ 3] = a[ 3] - (b[ 3] & m); - r[ 4] = a[ 4] - (b[ 4] & m); - r[ 5] = a[ 5] - (b[ 5] & m); - r[ 6] = a[ 6] - (b[ 6] & m); - r[ 7] = a[ 7] - (b[ 7] & m); - r[ 8] = a[ 8] - (b[ 8] & m); - r[ 9] = a[ 9] - (b[ 9] & m); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Mul a by scalar b and add into r. (r += a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A scalar. - */ -SP_NOINLINE static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a, - const sp_digit b) -{ -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; - int i; - - for (i = 0; i < 10; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x3ffffff; - t >>= 26; - } - r[10] += (sp_digit)t; -#else - int64_t tb = b; - int64_t t[10]; - - t[ 0] = tb * a[ 0]; - t[ 1] = tb * a[ 1]; - t[ 2] = tb * a[ 2]; - t[ 3] = tb * a[ 3]; - t[ 4] = tb * a[ 4]; - t[ 5] = tb * a[ 5]; - t[ 6] = tb * a[ 6]; - t[ 7] = tb * a[ 7]; - t[ 8] = tb * a[ 8]; - t[ 9] = tb * a[ 9]; - r[ 0] += (sp_digit) (t[ 0] & 0x3ffffff); - r[ 1] += (sp_digit)((t[ 0] >> 26) + (t[ 1] & 0x3ffffff)); - r[ 2] += (sp_digit)((t[ 1] >> 26) + (t[ 2] & 0x3ffffff)); - r[ 3] += (sp_digit)((t[ 2] >> 26) + (t[ 3] & 0x3ffffff)); - r[ 4] += (sp_digit)((t[ 3] >> 26) + (t[ 4] & 0x3ffffff)); - r[ 5] += (sp_digit)((t[ 4] >> 26) + (t[ 5] & 0x3ffffff)); - r[ 6] += (sp_digit)((t[ 5] >> 26) + (t[ 6] & 0x3ffffff)); - r[ 7] += (sp_digit)((t[ 6] >> 26) + (t[ 7] & 0x3ffffff)); - r[ 8] += (sp_digit)((t[ 7] >> 26) + (t[ 8] & 0x3ffffff)); - r[ 9] += (sp_digit)((t[ 8] >> 26) + (t[ 9] & 0x3ffffff)); - r[10] += (sp_digit) (t[ 9] >> 26); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 26. - * - * a Array of sp_digit to normalize. - */ -static void sp_256_norm_10(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 9; i++) { - a[i+1] += a[i] >> 26; - a[i] &= 0x3ffffff; - } -#else - a[1] += a[0] >> 26; a[0] &= 0x3ffffff; - a[2] += a[1] >> 26; a[1] &= 0x3ffffff; - a[3] += a[2] >> 26; a[2] &= 0x3ffffff; - a[4] += a[3] >> 26; a[3] &= 0x3ffffff; - a[5] += a[4] >> 26; a[4] &= 0x3ffffff; - a[6] += a[5] >> 26; a[5] &= 0x3ffffff; - a[7] += a[6] >> 26; a[6] &= 0x3ffffff; - a[8] += a[7] >> 26; a[7] &= 0x3ffffff; - a[9] += a[8] >> 26; a[8] &= 0x3ffffff; -#endif -} - -/* Shift the result in the high 256 bits down to the bottom. - * - * r A single precision number. - * a A single precision number. 
- */ -static void sp_256_mont_shift_10(sp_digit* r, const sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - sp_digit n; - sp_digit s; - - s = a[10]; - n = a[9] >> 22; - for (i = 0; i < 9; i++) { - n += (s & 0x3ffffff) << 4; - r[i] = n & 0x3ffffff; - n >>= 26; - s = a[11 + i] + (s >> 26); - } - n += s << 4; - r[9] = n; -#else - sp_digit n; - sp_digit s; - - s = a[10]; n = a[9] >> 22; - n += (s & 0x3ffffff) << 4; r[ 0] = n & 0x3ffffff; - n >>= 26; s = a[11] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 1] = n & 0x3ffffff; - n >>= 26; s = a[12] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 2] = n & 0x3ffffff; - n >>= 26; s = a[13] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 3] = n & 0x3ffffff; - n >>= 26; s = a[14] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 4] = n & 0x3ffffff; - n >>= 26; s = a[15] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 5] = n & 0x3ffffff; - n >>= 26; s = a[16] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 6] = n & 0x3ffffff; - n >>= 26; s = a[17] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 7] = n & 0x3ffffff; - n >>= 26; s = a[18] + (s >> 26); - n += (s & 0x3ffffff) << 4; r[ 8] = n & 0x3ffffff; - n >>= 26; s = a[19] + (s >> 26); - n += s << 4; r[ 9] = n; -#endif /* WOLFSSL_SP_SMALL */ - XMEMSET(&r[10], 0, sizeof(*r) * 10U); -} - -/* Reduce the number back to 256 bits using Montgomery reduction. - * - * a A single precision number to reduce in place. - * m The single precision number representing the modulus. - * mp The digit representing the negative inverse of m mod 2^n. - */ -static void sp_256_mont_reduce_10(sp_digit* a, const sp_digit* m, sp_digit mp) -{ - int i; - sp_digit mu; - - if (mp != 1) { - for (i=0; i<9; i++) { - mu = (a[i] * mp) & 0x3ffffff; - sp_256_mul_add_10(a+i, m, mu); - a[i+1] += a[i] >> 26; - } - mu = (a[i] * mp) & 0x3fffffL; - sp_256_mul_add_10(a+i, m, mu); - a[i+1] += a[i] >> 26; - a[i] &= 0x3ffffff; - } - else { - for (i=0; i<9; i++) { - mu = a[i] & 0x3ffffff; - sp_256_mul_add_10(a+i, p256_mod, mu); - a[i+1] += a[i] >> 26; - } - mu = a[i] & 0x3fffffL; - sp_256_mul_add_10(a+i, p256_mod, mu); - a[i+1] += a[i] >> 26; - a[i] &= 0x3ffffff; - } - - sp_256_mont_shift_10(a, a); - sp_256_cond_sub_10(a, a, m, 0 - (((a[9] >> 22) > 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(a); -} - -/* Multiply two Montogmery form numbers mod the modulus (prime). - * (r = a * b mod m) - * - * r Result of multiplication. - * a First number to multiply in Montogmery form. - * b Second number to multiply in Montogmery form. - * m Modulus (prime). - * mp Montogmery mulitplier. - */ -static void sp_256_mont_mul_10(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit* m, sp_digit mp) -{ - sp_256_mul_10(r, a, b); - sp_256_mont_reduce_10(r, m, mp); -} - -/* Square the Montgomery form number. (r = a * a mod m) - * - * r Result of squaring. - * a Number to square in Montogmery form. - * m Modulus (prime). - * mp Montogmery mulitplier. - */ -static void sp_256_mont_sqr_10(sp_digit* r, const sp_digit* a, - const sp_digit* m, sp_digit mp) -{ - sp_256_sqr_10(r, a); - sp_256_mont_reduce_10(r, m, mp); -} - -#if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY) -/* Square the Montgomery form number a number of times. (r = a ^ n mod m) - * - * r Result of squaring. - * a Number to square in Montogmery form. - * n Number of times to square. - * m Modulus (prime). - * mp Montogmery mulitplier. 
- */ -static void sp_256_mont_sqr_n_10(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) -{ - sp_256_mont_sqr_10(r, a, m, mp); - for (; n > 1; n--) { - sp_256_mont_sqr_10(r, r, m, mp); - } -} - -#endif /* !WOLFSSL_SP_SMALL | HAVE_COMP_KEY */ -#ifdef WOLFSSL_SP_SMALL -/* Mod-2 for the P256 curve. */ -static const uint32_t p256_mod_minus_2[8] = { - 0xfffffffdU,0xffffffffU,0xffffffffU,0x00000000U,0x00000000U,0x00000000U, - 0x00000001U,0xffffffffU -}; -#endif /* !WOLFSSL_SP_SMALL */ - -/* Invert the number, in Montgomery form, modulo the modulus (prime) of the - * P256 curve. (r = 1 / a mod m) - * - * r Inverse result. - * a Number to invert. - * td Temporary data. - */ -static void sp_256_mont_inv_10(sp_digit* r, const sp_digit* a, sp_digit* td) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit* t = td; - int i; - - XMEMCPY(t, a, sizeof(sp_digit) * 10); - for (i=254; i>=0; i--) { - sp_256_mont_sqr_10(t, t, p256_mod, p256_mp_mod); - if (p256_mod_minus_2[i / 32] & ((sp_digit)1 << (i % 32))) - sp_256_mont_mul_10(t, t, a, p256_mod, p256_mp_mod); - } - XMEMCPY(r, t, sizeof(sp_digit) * 10); -#else - sp_digit* t1 = td; - sp_digit* t2 = td + 2 * 10; - sp_digit* t3 = td + 4 * 10; - /* 0x2 */ - sp_256_mont_sqr_10(t1, a, p256_mod, p256_mp_mod); - /* 0x3 */ - sp_256_mont_mul_10(t2, t1, a, p256_mod, p256_mp_mod); - /* 0xc */ - sp_256_mont_sqr_n_10(t1, t2, 2, p256_mod, p256_mp_mod); - /* 0xd */ - sp_256_mont_mul_10(t3, t1, a, p256_mod, p256_mp_mod); - /* 0xf */ - sp_256_mont_mul_10(t2, t2, t1, p256_mod, p256_mp_mod); - /* 0xf0 */ - sp_256_mont_sqr_n_10(t1, t2, 4, p256_mod, p256_mp_mod); - /* 0xfd */ - sp_256_mont_mul_10(t3, t3, t1, p256_mod, p256_mp_mod); - /* 0xff */ - sp_256_mont_mul_10(t2, t2, t1, p256_mod, p256_mp_mod); - /* 0xff00 */ - sp_256_mont_sqr_n_10(t1, t2, 8, p256_mod, p256_mp_mod); - /* 0xfffd */ - sp_256_mont_mul_10(t3, t3, t1, p256_mod, p256_mp_mod); - /* 0xffff */ - sp_256_mont_mul_10(t2, t2, t1, p256_mod, p256_mp_mod); - /* 0xffff0000 */ - sp_256_mont_sqr_n_10(t1, t2, 16, p256_mod, p256_mp_mod); - /* 0xfffffffd */ - sp_256_mont_mul_10(t3, t3, t1, p256_mod, p256_mp_mod); - /* 0xffffffff */ - sp_256_mont_mul_10(t2, t2, t1, p256_mod, p256_mp_mod); - /* 0xffffffff00000000 */ - sp_256_mont_sqr_n_10(t1, t2, 32, p256_mod, p256_mp_mod); - /* 0xffffffffffffffff */ - sp_256_mont_mul_10(t2, t2, t1, p256_mod, p256_mp_mod); - /* 0xffffffff00000001 */ - sp_256_mont_mul_10(r, t1, a, p256_mod, p256_mp_mod); - /* 0xffffffff000000010000000000000000000000000000000000000000 */ - sp_256_mont_sqr_n_10(r, r, 160, p256_mod, p256_mp_mod); - /* 0xffffffff00000001000000000000000000000000ffffffffffffffff */ - sp_256_mont_mul_10(r, r, t2, p256_mod, p256_mp_mod); - /* 0xffffffff00000001000000000000000000000000ffffffffffffffff00000000 */ - sp_256_mont_sqr_n_10(r, r, 32, p256_mod, p256_mp_mod); - /* 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd */ - sp_256_mont_mul_10(r, r, t3, p256_mod, p256_mp_mod); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Map the Montgomery form projective coordinate point to an affine point. - * - * r Resulting affine coordinate point. - * p Montgomery form projective coordinate point. - * t Temporary ordinate data. 
- */ -static void sp_256_map_10(sp_point_256* r, const sp_point_256* p, - sp_digit* t) -{ - sp_digit* t1 = t; - sp_digit* t2 = t + 2*10; - int32_t n; - - sp_256_mont_inv_10(t1, p->z, t + 2*10); - - sp_256_mont_sqr_10(t2, t1, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t1, t2, t1, p256_mod, p256_mp_mod); - - /* x /= z^2 */ - sp_256_mont_mul_10(r->x, p->x, t2, p256_mod, p256_mp_mod); - XMEMSET(r->x + 10, 0, sizeof(r->x) / 2U); - sp_256_mont_reduce_10(r->x, p256_mod, p256_mp_mod); - /* Reduce x to less than modulus */ - n = sp_256_cmp_10(r->x, p256_mod); - sp_256_cond_sub_10(r->x, r->x, p256_mod, 0 - ((n >= 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r->x); - - /* y /= z^3 */ - sp_256_mont_mul_10(r->y, p->y, t1, p256_mod, p256_mp_mod); - XMEMSET(r->y + 10, 0, sizeof(r->y) / 2U); - sp_256_mont_reduce_10(r->y, p256_mod, p256_mp_mod); - /* Reduce y to less than modulus */ - n = sp_256_cmp_10(r->y, p256_mod); - sp_256_cond_sub_10(r->y, r->y, p256_mod, 0 - ((n >= 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r->y); - - XMEMSET(r->z, 0, sizeof(r->z)); - r->z[0] = 1; - -} - -/* Add two Montgomery form numbers (r = a + b % m). - * - * r Result of addition. - * a First number to add in Montogmery form. - * b Second number to add in Montogmery form. - * m Modulus (prime). - */ -static void sp_256_mont_add_10(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - (void)sp_256_add_10(r, a, b); - sp_256_norm_10(r); - sp_256_cond_sub_10(r, r, m, 0 - (((r[9] >> 22) > 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r); -} - -/* Double a Montgomery form number (r = a + a % m). - * - * r Result of doubling. - * a Number to double in Montogmery form. - * m Modulus (prime). - */ -static void sp_256_mont_dbl_10(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - (void)sp_256_add_10(r, a, a); - sp_256_norm_10(r); - sp_256_cond_sub_10(r, r, m, 0 - (((r[9] >> 22) > 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r); -} - -/* Triple a Montgomery form number (r = a + a + a % m). - * - * r Result of Tripling. - * a Number to triple in Montogmery form. - * m Modulus (prime). - */ -static void sp_256_mont_tpl_10(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - (void)sp_256_add_10(r, a, a); - sp_256_norm_10(r); - sp_256_cond_sub_10(r, r, m, 0 - (((r[9] >> 22) > 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r); - (void)sp_256_add_10(r, r, a); - sp_256_norm_10(r); - sp_256_cond_sub_10(r, r, m, 0 - (((r[9] >> 22) > 0) ? - (sp_digit)1 : (sp_digit)0)); - sp_256_norm_10(r); -} - -/* Conditionally add a and b using the mask m. - * m is -1 to add and 0 when not. - * - * r A single precision number representing conditional add result. - * a A single precision number to add with. - * b A single precision number to add. - * m Mask value to apply. - */ -static void sp_256_cond_add_10(sp_digit* r, const sp_digit* a, - const sp_digit* b, const sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 10; i++) { - r[i] = a[i] + (b[i] & m); - } -#else - r[ 0] = a[ 0] + (b[ 0] & m); - r[ 1] = a[ 1] + (b[ 1] & m); - r[ 2] = a[ 2] + (b[ 2] & m); - r[ 3] = a[ 3] + (b[ 3] & m); - r[ 4] = a[ 4] + (b[ 4] & m); - r[ 5] = a[ 5] + (b[ 5] & m); - r[ 6] = a[ 6] + (b[ 6] & m); - r[ 7] = a[ 7] + (b[ 7] & m); - r[ 8] = a[ 8] + (b[ 8] & m); - r[ 9] = a[ 9] + (b[ 9] & m); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * r Result of subtration. - * a Number to subtract from in Montogmery form. 
- * b Number to subtract with in Montogmery form. - * m Modulus (prime). - */ -static void sp_256_mont_sub_10(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - (void)sp_256_sub_10(r, a, b); - sp_256_norm_10(r); - sp_256_cond_add_10(r, r, m, r[9] >> 22); - sp_256_norm_10(r); -} - -/* Shift number left one bit. - * Bottom bit is lost. - * - * r Result of shift. - * a Number to shift. - */ -SP_NOINLINE static void sp_256_rshift1_10(sp_digit* r, const sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<9; i++) { - r[i] = (a[i] >> 1) + ((a[i + 1] << 25) & 0x3ffffff); - } -#else - r[0] = (a[0] >> 1) + ((a[1] << 25) & 0x3ffffff); - r[1] = (a[1] >> 1) + ((a[2] << 25) & 0x3ffffff); - r[2] = (a[2] >> 1) + ((a[3] << 25) & 0x3ffffff); - r[3] = (a[3] >> 1) + ((a[4] << 25) & 0x3ffffff); - r[4] = (a[4] >> 1) + ((a[5] << 25) & 0x3ffffff); - r[5] = (a[5] >> 1) + ((a[6] << 25) & 0x3ffffff); - r[6] = (a[6] >> 1) + ((a[7] << 25) & 0x3ffffff); - r[7] = (a[7] >> 1) + ((a[8] << 25) & 0x3ffffff); - r[8] = (a[8] >> 1) + ((a[9] << 25) & 0x3ffffff); -#endif - r[9] = a[9] >> 1; -} - -/* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) - * - * r Result of division by 2. - * a Number to divide. - * m Modulus (prime). - */ -static void sp_256_div2_10(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - sp_256_cond_add_10(r, a, m, 0 - (a[0] & 1)); - sp_256_norm_10(r); - sp_256_rshift1_10(r, r); -} - -/* Double the Montgomery form projective point p. - * - * r Result of doubling point. - * p Point to double. - * t Temporary ordinate data. - */ -#ifdef WOLFSSL_SP_NONBLOCK -typedef struct sp_256_proj_point_dbl_10_ctx { - int state; - sp_digit* t1; - sp_digit* t2; - sp_digit* x; - sp_digit* y; - sp_digit* z; -} sp_256_proj_point_dbl_10_ctx; - -static int sp_256_proj_point_dbl_10_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t) -{ - int err = FP_WOULDBLOCK; - sp_256_proj_point_dbl_10_ctx* ctx = (sp_256_proj_point_dbl_10_ctx*)sp_ctx->data; - - typedef char ctx_size_test[sizeof(sp_256_proj_point_dbl_10_ctx) >= sizeof(*sp_ctx) ? -1 : 1]; - (void)sizeof(ctx_size_test); - - switch (ctx->state) { - case 0: - ctx->t1 = t; - ctx->t2 = t + 2*10; - ctx->x = r->x; - ctx->y = r->y; - ctx->z = r->z; - - /* Put infinity into result. 
*/ - if (r != p) { - r->infinity = p->infinity; - } - ctx->state = 1; - break; - case 1: - /* T1 = Z * Z */ - sp_256_mont_sqr_10(ctx->t1, p->z, p256_mod, p256_mp_mod); - ctx->state = 2; - break; - case 2: - /* Z = Y * Z */ - sp_256_mont_mul_10(ctx->z, p->y, p->z, p256_mod, p256_mp_mod); - ctx->state = 3; - break; - case 3: - /* Z = 2Z */ - sp_256_mont_dbl_10(ctx->z, ctx->z, p256_mod); - ctx->state = 4; - break; - case 4: - /* T2 = X - T1 */ - sp_256_mont_sub_10(ctx->t2, p->x, ctx->t1, p256_mod); - ctx->state = 5; - break; - case 5: - /* T1 = X + T1 */ - sp_256_mont_add_10(ctx->t1, p->x, ctx->t1, p256_mod); - ctx->state = 6; - break; - case 6: - /* T2 = T1 * T2 */ - sp_256_mont_mul_10(ctx->t2, ctx->t1, ctx->t2, p256_mod, p256_mp_mod); - ctx->state = 7; - break; - case 7: - /* T1 = 3T2 */ - sp_256_mont_tpl_10(ctx->t1, ctx->t2, p256_mod); - ctx->state = 8; - break; - case 8: - /* Y = 2Y */ - sp_256_mont_dbl_10(ctx->y, p->y, p256_mod); - ctx->state = 9; - break; - case 9: - /* Y = Y * Y */ - sp_256_mont_sqr_10(ctx->y, ctx->y, p256_mod, p256_mp_mod); - ctx->state = 10; - break; - case 10: - /* T2 = Y * Y */ - sp_256_mont_sqr_10(ctx->t2, ctx->y, p256_mod, p256_mp_mod); - ctx->state = 11; - break; - case 11: - /* T2 = T2/2 */ - sp_256_div2_10(ctx->t2, ctx->t2, p256_mod); - ctx->state = 12; - break; - case 12: - /* Y = Y * X */ - sp_256_mont_mul_10(ctx->y, ctx->y, p->x, p256_mod, p256_mp_mod); - ctx->state = 13; - break; - case 13: - /* X = T1 * T1 */ - sp_256_mont_sqr_10(ctx->x, ctx->t1, p256_mod, p256_mp_mod); - ctx->state = 14; - break; - case 14: - /* X = X - Y */ - sp_256_mont_sub_10(ctx->x, ctx->x, ctx->y, p256_mod); - ctx->state = 15; - break; - case 15: - /* X = X - Y */ - sp_256_mont_sub_10(ctx->x, ctx->x, ctx->y, p256_mod); - ctx->state = 16; - break; - case 16: - /* Y = Y - X */ - sp_256_mont_sub_10(ctx->y, ctx->y, ctx->x, p256_mod); - ctx->state = 17; - break; - case 17: - /* Y = Y * T1 */ - sp_256_mont_mul_10(ctx->y, ctx->y, ctx->t1, p256_mod, p256_mp_mod); - ctx->state = 18; - break; - case 18: - /* Y = Y - T2 */ - sp_256_mont_sub_10(ctx->y, ctx->y, ctx->t2, p256_mod); - ctx->state = 19; - /* fall-through */ - case 19: - err = MP_OKAY; - break; - } - - if (err == MP_OKAY && ctx->state != 19) { - err = FP_WOULDBLOCK; - } - - return err; -} -#endif /* WOLFSSL_SP_NONBLOCK */ - -static void sp_256_proj_point_dbl_10(sp_point_256* r, const sp_point_256* p, sp_digit* t) -{ - sp_digit* t1 = t; - sp_digit* t2 = t + 2*10; - sp_digit* x; - sp_digit* y; - sp_digit* z; - - x = r->x; - y = r->y; - z = r->z; - /* Put infinity into result. 
*/ - if (r != p) { - r->infinity = p->infinity; - } - - /* T1 = Z * Z */ - sp_256_mont_sqr_10(t1, p->z, p256_mod, p256_mp_mod); - /* Z = Y * Z */ - sp_256_mont_mul_10(z, p->y, p->z, p256_mod, p256_mp_mod); - /* Z = 2Z */ - sp_256_mont_dbl_10(z, z, p256_mod); - /* T2 = X - T1 */ - sp_256_mont_sub_10(t2, p->x, t1, p256_mod); - /* T1 = X + T1 */ - sp_256_mont_add_10(t1, p->x, t1, p256_mod); - /* T2 = T1 * T2 */ - sp_256_mont_mul_10(t2, t1, t2, p256_mod, p256_mp_mod); - /* T1 = 3T2 */ - sp_256_mont_tpl_10(t1, t2, p256_mod); - /* Y = 2Y */ - sp_256_mont_dbl_10(y, p->y, p256_mod); - /* Y = Y * Y */ - sp_256_mont_sqr_10(y, y, p256_mod, p256_mp_mod); - /* T2 = Y * Y */ - sp_256_mont_sqr_10(t2, y, p256_mod, p256_mp_mod); - /* T2 = T2/2 */ - sp_256_div2_10(t2, t2, p256_mod); - /* Y = Y * X */ - sp_256_mont_mul_10(y, y, p->x, p256_mod, p256_mp_mod); - /* X = T1 * T1 */ - sp_256_mont_sqr_10(x, t1, p256_mod, p256_mp_mod); - /* X = X - Y */ - sp_256_mont_sub_10(x, x, y, p256_mod); - /* X = X - Y */ - sp_256_mont_sub_10(x, x, y, p256_mod); - /* Y = Y - X */ - sp_256_mont_sub_10(y, y, x, p256_mod); - /* Y = Y * T1 */ - sp_256_mont_mul_10(y, y, t1, p256_mod, p256_mp_mod); - /* Y = Y - T2 */ - sp_256_mont_sub_10(y, y, t2, p256_mod); -} - -/* Compare two numbers to determine if they are equal. - * Constant time implementation. - * - * a First number to compare. - * b Second number to compare. - * returns 1 when equal and 0 otherwise. - */ -static int sp_256_cmp_equal_10(const sp_digit* a, const sp_digit* b) -{ - return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | - (a[3] ^ b[3]) | (a[4] ^ b[4]) | (a[5] ^ b[5]) | - (a[6] ^ b[6]) | (a[7] ^ b[7]) | (a[8] ^ b[8]) | - (a[9] ^ b[9])) == 0; -} - -/* Add two Montgomery form projective points. - * - * r Result of addition. - * p First point to add. - * q Second point to add. - * t Temporary ordinate data. - */ - -#ifdef WOLFSSL_SP_NONBLOCK -typedef struct sp_256_proj_point_add_10_ctx { - int state; - sp_256_proj_point_dbl_10_ctx dbl_ctx; - const sp_point_256* ap[2]; - sp_point_256* rp[2]; - sp_digit* t1; - sp_digit* t2; - sp_digit* t3; - sp_digit* t4; - sp_digit* t5; - sp_digit* x; - sp_digit* y; - sp_digit* z; -} sp_256_proj_point_add_10_ctx; - -static int sp_256_proj_point_add_10_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, - const sp_point_256* p, const sp_point_256* q, sp_digit* t) -{ - int err = FP_WOULDBLOCK; - sp_256_proj_point_add_10_ctx* ctx = (sp_256_proj_point_add_10_ctx*)sp_ctx->data; - - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } - - typedef char ctx_size_test[sizeof(sp_256_proj_point_add_10_ctx) >= sizeof(*sp_ctx) ? 
-1 : 1]; - (void)sizeof(ctx_size_test); - - switch (ctx->state) { - case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*10; - ctx->t3 = t + 4*10; - ctx->t4 = t + 6*10; - ctx->t5 = t + 8*10; - - ctx->state = 1; - break; - case 1: - /* Check double */ - (void)sp_256_sub_10(ctx->t1, p256_mod, q->y); - sp_256_norm_10(ctx->t1); - if ((sp_256_cmp_equal_10(p->x, q->x) & sp_256_cmp_equal_10(p->z, q->z) & - (sp_256_cmp_equal_10(p->y, q->y) | sp_256_cmp_equal_10(p->y, ctx->t1))) != 0) - { - XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx)); - ctx->state = 2; - } - else { - ctx->state = 3; - } - break; - case 2: - err = sp_256_proj_point_dbl_10_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t); - if (err == MP_OKAY) - ctx->state = 27; /* done */ - break; - case 3: - { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<10; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<10; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<10; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - - ctx->state = 4; - break; - } - case 4: - /* U1 = X1*Z2^2 */ - sp_256_mont_sqr_10(ctx->t1, q->z, p256_mod, p256_mp_mod); - ctx->state = 5; - break; - case 5: - sp_256_mont_mul_10(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod); - ctx->state = 6; - break; - case 6: - sp_256_mont_mul_10(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); - ctx->state = 7; - break; - case 7: - /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_10(ctx->t2, ctx->z, p256_mod, p256_mp_mod); - ctx->state = 8; - break; - case 8: - sp_256_mont_mul_10(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); - ctx->state = 9; - break; - case 9: - sp_256_mont_mul_10(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod); - ctx->state = 10; - break; - case 10: - /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_10(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); - ctx->state = 11; - break; - case 11: - /* S2 = Y2*Z1^3 */ - sp_256_mont_mul_10(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod); - ctx->state = 12; - break; - case 12: - /* H = U2 - U1 */ - sp_256_mont_sub_10(ctx->t2, ctx->t2, ctx->t1, p256_mod); - ctx->state = 13; - break; - case 13: - /* R = S2 - S1 */ - sp_256_mont_sub_10(ctx->t4, ctx->t4, ctx->t3, p256_mod); - ctx->state = 14; - break; - case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_10(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); - ctx->state = 15; - break; - case 15: - sp_256_mont_mul_10(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); - ctx->state = 16; - break; - case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_10(ctx->x, ctx->t4, p256_mod, p256_mp_mod); - ctx->state = 17; - break; - case 17: - sp_256_mont_sqr_10(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); - ctx->state = 18; - break; - case 18: - sp_256_mont_mul_10(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); - ctx->state = 19; - break; - case 19: - sp_256_mont_mul_10(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); - ctx->state = 20; - break; - case 20: - sp_256_mont_sub_10(ctx->x, ctx->x, ctx->t5, p256_mod); - ctx->state = 21; - break; - case 21: - sp_256_mont_dbl_10(ctx->t1, ctx->y, p256_mod); - ctx->state = 22; - break; - case 22: - sp_256_mont_sub_10(ctx->x, ctx->x, ctx->t1, p256_mod); - ctx->state = 23; - break; 
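The removed *_nb routines here split one projective-point operation into small numbered states: each call runs a single bounded step, records the next state in the context, and returns FP_WOULDBLOCK until the final state reports MP_OKAY, so a single-threaded caller can interleave other work with a long EC computation. A minimal self-contained sketch of that resumable-state pattern (toy names only, not the wolfSSL API):

#include <stdio.h>

#define WOULD_BLOCK (-1)
#define DONE          0

typedef struct {
    int state;   /* next step to resume at */
    int sum;     /* running result */
} nb_ctx;

/* One bounded unit of work per call; the context carries the resume point. */
static int toy_sum_nb(nb_ctx* ctx, const int* data, int len)
{
    if (ctx->state < len) {
        ctx->sum += data[ctx->state];
        ctx->state++;
        return WOULD_BLOCK;   /* more states remain */
    }
    return DONE;              /* terminal state reached */
}

int main(void)
{
    int data[4] = { 1, 2, 3, 4 };
    nb_ctx ctx = { 0, 0 };
    int ret;

    /* Callers of the real non-blocking functions loop the same way on
     * FP_WOULDBLOCK, doing other work between calls. */
    do {
        ret = toy_sum_nb(&ctx, data, 4);
    } while (ret == WOULD_BLOCK);

    printf("sum = %d\n", ctx.sum);   /* prints: sum = 10 */
    return 0;
}

Keeping the per-call work bounded is what allows the terminal state alone to return MP_OKAY while every intermediate state reports FP_WOULDBLOCK.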
- case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_10(ctx->y, ctx->y, ctx->x, p256_mod); - ctx->state = 24; - break; - case 24: - sp_256_mont_mul_10(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); - ctx->state = 25; - break; - case 25: - sp_256_mont_mul_10(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); - ctx->state = 26; - break; - case 26: - sp_256_mont_sub_10(ctx->y, ctx->y, ctx->t5, p256_mod); - ctx->state = 27; - /* fall-through */ - case 27: - err = MP_OKAY; - break; - } - - if (err == MP_OKAY && ctx->state != 27) { - err = FP_WOULDBLOCK; - } - return err; -} -#endif /* WOLFSSL_SP_NONBLOCK */ - -static void sp_256_proj_point_add_10(sp_point_256* r, - const sp_point_256* p, const sp_point_256* q, sp_digit* t) -{ - const sp_point_256* ap[2]; - sp_point_256* rp[2]; - sp_digit* t1 = t; - sp_digit* t2 = t + 2*10; - sp_digit* t3 = t + 4*10; - sp_digit* t4 = t + 6*10; - sp_digit* t5 = t + 8*10; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; - - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } - - /* Check double */ - (void)sp_256_sub_10(t1, p256_mod, q->y); - sp_256_norm_10(t1); - if ((sp_256_cmp_equal_10(p->x, q->x) & sp_256_cmp_equal_10(p->z, q->z) & - (sp_256_cmp_equal_10(p->y, q->y) | sp_256_cmp_equal_10(p->y, t1))) != 0) { - sp_256_proj_point_dbl_10(r, p, t); - } - else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<10; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<10; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<10; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; - - /* U1 = X1*Z2^2 */ - sp_256_mont_sqr_10(t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t1, t1, x, p256_mod, p256_mp_mod); - /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_10(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t4, t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t2, t2, q->x, p256_mod, p256_mp_mod); - /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_10(t3, t3, y, p256_mod, p256_mp_mod); - /* S2 = Y2*Z1^3 */ - sp_256_mont_mul_10(t4, t4, q->y, p256_mod, p256_mp_mod); - /* H = U2 - U1 */ - sp_256_mont_sub_10(t2, t2, t1, p256_mod); - /* R = S2 - S1 */ - sp_256_mont_sub_10(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_10(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(z, z, t2, p256_mod, p256_mp_mod); - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_10(x, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(y, t1, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(x, x, t5, p256_mod); - sp_256_mont_dbl_10(t1, y, p256_mod); - sp_256_mont_sub_10(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_10(y, y, x, p256_mod); - sp_256_mont_mul_10(y, y, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(y, y, t5, p256_mod); - } -} - -/* Multiply a number by Montogmery normalizer mod modulus (prime). - * - * r The resulting Montgomery form number. - * a The number to convert. - * m The modulus (prime). 
- * returns MEMORY_E when memory allocation fails and MP_OKAY otherwise. - */ -static int sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - int64_t* t = NULL; -#else - int64_t t[2 * 8]; -#endif - int64_t* a32 = NULL; - int64_t o; - int err = MP_OKAY; - - (void)m; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); - if (t == NULL) - return MEMORY_E; -#endif - - if (err == MP_OKAY) { - a32 = t + 8; - - a32[0] = a[0]; - a32[0] |= a[1] << 26U; - a32[0] &= 0xffffffffL; - a32[1] = (a[1] >> 6); - a32[1] |= a[2] << 20U; - a32[1] &= 0xffffffffL; - a32[2] = (a[2] >> 12); - a32[2] |= a[3] << 14U; - a32[2] &= 0xffffffffL; - a32[3] = (a[3] >> 18); - a32[3] |= a[4] << 8U; - a32[3] &= 0xffffffffL; - a32[4] = (a[4] >> 24); - a32[4] |= a[5] << 2U; - a32[4] |= a[6] << 28U; - a32[4] &= 0xffffffffL; - a32[5] = (a[6] >> 4); - a32[5] |= a[7] << 22U; - a32[5] &= 0xffffffffL; - a32[6] = (a[7] >> 10); - a32[6] |= a[8] << 16U; - a32[6] &= 0xffffffffL; - a32[7] = (a[8] >> 16); - a32[7] |= a[9] << 10U; - a32[7] &= 0xffffffffL; - - /* 1 1 0 -1 -1 -1 -1 0 */ - t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; - /* 0 1 1 0 -1 -1 -1 -1 */ - t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; - /* 0 0 1 1 0 -1 -1 -1 */ - t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; - /* -1 -1 0 2 2 1 0 -1 */ - t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; - /* 0 -1 -1 0 2 2 1 0 */ - t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; - /* 0 0 -1 -1 0 2 2 1 */ - t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; - /* -1 -1 0 0 0 1 3 2 */ - t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; - /* 1 0 -1 -1 -1 -1 0 3 */ - t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; - - t[1] += t[0] >> 32U; t[0] &= 0xffffffffL; - t[2] += t[1] >> 32U; t[1] &= 0xffffffffL; - t[3] += t[2] >> 32U; t[2] &= 0xffffffffL; - t[4] += t[3] >> 32U; t[3] &= 0xffffffffL; - t[5] += t[4] >> 32U; t[4] &= 0xffffffffL; - t[6] += t[5] >> 32U; t[5] &= 0xffffffffL; - t[7] += t[6] >> 32U; t[6] &= 0xffffffffL; - o = t[7] >> 32U; t[7] &= 0xffffffffL; - t[0] += o; - t[3] -= o; - t[6] -= o; - t[7] += o; - t[1] += t[0] >> 32U; t[0] &= 0xffffffffL; - t[2] += t[1] >> 32U; t[1] &= 0xffffffffL; - t[3] += t[2] >> 32U; t[2] &= 0xffffffffL; - t[4] += t[3] >> 32U; t[3] &= 0xffffffffL; - t[5] += t[4] >> 32U; t[4] &= 0xffffffffL; - t[6] += t[5] >> 32U; t[5] &= 0xffffffffL; - t[7] += t[6] >> 32U; t[6] &= 0xffffffffL; - - r[0] = (sp_digit)(t[0]) & 0x3ffffffL; - r[1] = (sp_digit)(t[0] >> 26U); - r[1] |= (sp_digit)(t[1] << 6U); - r[1] &= 0x3ffffffL; - r[2] = (sp_digit)(t[1] >> 20U); - r[2] |= (sp_digit)(t[2] << 12U); - r[2] &= 0x3ffffffL; - r[3] = (sp_digit)(t[2] >> 14U); - r[3] |= (sp_digit)(t[3] << 18U); - r[3] &= 0x3ffffffL; - r[4] = (sp_digit)(t[3] >> 8U); - r[4] |= (sp_digit)(t[4] << 24U); - r[4] &= 0x3ffffffL; - r[5] = (sp_digit)(t[4] >> 2U) & 0x3ffffffL; - r[6] = (sp_digit)(t[4] >> 28U); - r[6] |= (sp_digit)(t[5] << 4U); - r[6] &= 0x3ffffffL; - r[7] = (sp_digit)(t[5] >> 22U); - r[7] |= (sp_digit)(t[6] << 10U); - r[7] &= 0x3ffffffL; - r[8] = (sp_digit)(t[6] >> 16U); - r[8] |= (sp_digit)(t[7] << 16U); - r[8] &= 0x3ffffffL; - r[9] = (sp_digit)(t[7] >> 10U); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (t != NULL) - XFREE(t, NULL, DYNAMIC_TYPE_ECC); -#endif - 
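Before applying the fixed P-256 reduction coefficients listed above, sp_256_mod_mul_norm_10 repacks the ten 26-bit limbs into eight 32-bit words and unpacks the result back into 26-bit limbs at the end; the removed code does this with hard-coded shift counts that step down by 6 per word (26, 20, 14, 8, ...). A small generic sketch of the same repacking step, written as a loop over a bit accumulator (illustrative names, not part of wolfSSL):

#include <stdint.h>
#include <stdio.h>

/* Repack ten 26-bit limbs (value = sum limb[i] * 2^(26*i)) into eight
 * 32-bit little-endian words. 10*26 = 260 bits covers the 256 bits needed. */
static void repack_26_to_32(uint32_t w[8], const uint32_t limb[10])
{
    uint64_t acc = 0;   /* bit accumulator */
    int bits = 0;       /* number of valid bits held in acc */
    int i = 0;          /* next limb to consume */
    int j = 0;          /* next word to produce */

    while (j < 8) {
        if (bits < 32 && i < 10) {
            /* pull the next limb in above the bits already held */
            acc |= (uint64_t)(limb[i++] & 0x3ffffff) << bits;
            bits += 26;
        }
        else {
            /* at least 32 bits available: emit one word */
            w[j++] = (uint32_t)acc;
            acc >>= 32;
            bits -= 32;
        }
    }
}

int main(void)
{
    uint32_t limb[10] = { 1, 1 };   /* 1 + 2^26 = 0x4000001 */
    uint32_t w[8] = { 0 };

    repack_26_to_32(w, limb);
    printf("0x%08x\n", (unsigned)w[0]);   /* prints: 0x04000001 */
    return 0;
}

The unrolled form in the removed code trades this loop-carried accumulator for one fixed shift-and-or statement per word.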
- return err; -} - -#ifdef WOLFSSL_SP_SMALL -/* Multiply the point by the scalar and return the result. - * If map is true then convert result to affine coordinates. - * - * Small implementation using add and double that is cache attack resistant but - * allocates memory rather than use large stacks. - * 256 adds and doubles. - * - * r Resulting point. - * g Point to multiply. - * k Scalar to multiply by. - * map Indicates whether to convert result to affine. - * ct Constant time required. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. - */ - -#ifdef WOLFSSL_SP_NONBLOCK -typedef struct sp_256_ecc_mulmod_10_ctx { - int state; - union { - sp_256_proj_point_dbl_10_ctx dbl_ctx; - sp_256_proj_point_add_10_ctx add_ctx; - }; - sp_point_256 t[3]; - sp_digit tmp[2 * 10 * 5]; - sp_digit n; - int i; - int c; - int y; -} sp_256_ecc_mulmod_10_ctx; - -static int sp_256_ecc_mulmod_10_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, - const sp_point_256* g, const sp_digit* k, int map, int ct, void* heap) -{ - int err = FP_WOULDBLOCK; - sp_256_ecc_mulmod_10_ctx* ctx = (sp_256_ecc_mulmod_10_ctx*)sp_ctx->data; - - typedef char ctx_size_test[sizeof(sp_256_ecc_mulmod_10_ctx) >= sizeof(*sp_ctx) ? -1 : 1]; - (void)sizeof(ctx_size_test); - - /* Implementation is constant time. */ - (void)ct; - - switch (ctx->state) { - case 0: /* INIT */ - XMEMSET(ctx->t, 0, sizeof(sp_point_256) * 3); - ctx->i = 9; - ctx->c = 22; - ctx->n = k[ctx->i--] << (26 - ctx->c); - - /* t[0] = {0, 0, 1} * norm */ - ctx->t[0].infinity = 1; - ctx->state = 1; - break; - case 1: /* T1X */ - /* t[1] = {g->x, g->y, g->z} * norm */ - err = sp_256_mod_mul_norm_10(ctx->t[1].x, g->x, p256_mod); - ctx->state = 2; - break; - case 2: /* T1Y */ - err = sp_256_mod_mul_norm_10(ctx->t[1].y, g->y, p256_mod); - ctx->state = 3; - break; - case 3: /* T1Z */ - err = sp_256_mod_mul_norm_10(ctx->t[1].z, g->z, p256_mod); - ctx->state = 4; - break; - case 4: /* ADDPREP */ - if (ctx->c == 0) { - if (ctx->i == -1) { - ctx->state = 7; - break; - } - - ctx->n = k[ctx->i--]; - ctx->c = 26; - } - ctx->y = (ctx->n >> 25) & 1; - ctx->n <<= 1; - XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx)); - ctx->state = 5; - break; - case 5: /* ADD */ - err = sp_256_proj_point_add_10_nb((sp_ecc_ctx_t*)&ctx->add_ctx, - &ctx->t[ctx->y^1], &ctx->t[0], &ctx->t[1], ctx->tmp); - if (err == MP_OKAY) { - XMEMCPY(&ctx->t[2], (void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) + - ((size_t)&ctx->t[1] & addr_mask[ctx->y])), - sizeof(sp_point_256)); - XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx)); - ctx->state = 6; - } - break; - case 6: /* DBL */ - err = sp_256_proj_point_dbl_10_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, &ctx->t[2], - &ctx->t[2], ctx->tmp); - if (err == MP_OKAY) { - XMEMCPY((void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) + - ((size_t)&ctx->t[1] & addr_mask[ctx->y])), &ctx->t[2], - sizeof(sp_point_256)); - ctx->state = 4; - ctx->c--; - } - break; - case 7: /* MAP */ - if (map != 0) { - sp_256_map_10(r, &ctx->t[0], ctx->tmp); - } - else { - XMEMCPY(r, &ctx->t[0], sizeof(sp_point_256)); - } - err = MP_OKAY; - break; - } - - if (err == MP_OKAY && ctx->state != 7) { - err = FP_WOULDBLOCK; - } - if (err != FP_WOULDBLOCK) { - ForceZero(ctx->tmp, sizeof(ctx->tmp)); - ForceZero(ctx->t, sizeof(ctx->t)); - } - - (void)heap; - - return err; -} - -#endif /* WOLFSSL_SP_NONBLOCK */ - -static int sp_256_ecc_mulmod_10(sp_point_256* r, const sp_point_256* g, - const sp_digit* k, int map, int ct, void* heap) -{ -#if defined(WOLFSSL_SMALL_STACK) && 
!defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* t = NULL; - sp_digit* tmp = NULL; -#else - sp_point_256 t[3]; - sp_digit tmp[2 * 10 * 5]; -#endif - sp_digit n; - int i; - int c; - int y; - int err = MP_OKAY; - - /* Implementation is constant time. */ - (void)ct; - (void)heap; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap, - DYNAMIC_TYPE_ECC); - if (t == NULL) - err = MEMORY_E; - if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 5, heap, - DYNAMIC_TYPE_ECC); - if (tmp == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - XMEMSET(t, 0, sizeof(sp_point_256) * 3); - - /* t[0] = {0, 0, 1} * norm */ - t[0].infinity = 1; - /* t[1] = {g->x, g->y, g->z} * norm */ - err = sp_256_mod_mul_norm_10(t[1].x, g->x, p256_mod); - } - if (err == MP_OKAY) - err = sp_256_mod_mul_norm_10(t[1].y, g->y, p256_mod); - if (err == MP_OKAY) - err = sp_256_mod_mul_norm_10(t[1].z, g->z, p256_mod); - - if (err == MP_OKAY) { - i = 9; - c = 22; - n = k[i--] << (26 - c); - for (; ; c--) { - if (c == 0) { - if (i == -1) - break; - - n = k[i--]; - c = 26; - } - - y = (n >> 25) & 1; - n <<= 1; - - sp_256_proj_point_add_10(&t[y^1], &t[0], &t[1], tmp); - - XMEMCPY(&t[2], (void*)(((size_t)&t[0] & addr_mask[y^1]) + - ((size_t)&t[1] & addr_mask[y])), - sizeof(sp_point_256)); - sp_256_proj_point_dbl_10(&t[2], &t[2], tmp); - XMEMCPY((void*)(((size_t)&t[0] & addr_mask[y^1]) + - ((size_t)&t[1] & addr_mask[y])), &t[2], - sizeof(sp_point_256)); - } - - if (map != 0) { - sp_256_map_10(r, &t[0], tmp); - } - else { - XMEMCPY(r, &t[0], sizeof(sp_point_256)); - } - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (tmp != NULL) -#endif - { - ForceZero(tmp, sizeof(sp_digit) * 2 * 10 * 5); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(tmp, heap, DYNAMIC_TYPE_ECC); - #endif - } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (t != NULL) -#endif - { - ForceZero(t, sizeof(sp_point_256) * 3); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(t, heap, DYNAMIC_TYPE_ECC); - #endif - } - - return err; -} - -#else -/* A table entry for pre-computed points. */ -typedef struct sp_table_entry_256 { - sp_digit x[10]; - sp_digit y[10]; -} sp_table_entry_256; - -/* Conditionally copy a into r using the mask m. - * m is -1 to copy and 0 when not. - * - * r A single precision number to copy over. - * a A single precision number to copy. - * m Mask value to apply. - */ -static void sp_256_cond_copy_10(sp_digit* r, const sp_digit* a, const sp_digit m) -{ - sp_digit t[10]; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 10; i++) { - t[i] = r[i] ^ a[i]; - } - for (i = 0; i < 10; i++) { - r[i] ^= t[i] & m; - } -#else - t[ 0] = r[ 0] ^ a[ 0]; - t[ 1] = r[ 1] ^ a[ 1]; - t[ 2] = r[ 2] ^ a[ 2]; - t[ 3] = r[ 3] ^ a[ 3]; - t[ 4] = r[ 4] ^ a[ 4]; - t[ 5] = r[ 5] ^ a[ 5]; - t[ 6] = r[ 6] ^ a[ 6]; - t[ 7] = r[ 7] ^ a[ 7]; - t[ 8] = r[ 8] ^ a[ 8]; - t[ 9] = r[ 9] ^ a[ 9]; - r[ 0] ^= t[ 0] & m; - r[ 1] ^= t[ 1] & m; - r[ 2] ^= t[ 2] & m; - r[ 3] ^= t[ 3] & m; - r[ 4] ^= t[ 4] & m; - r[ 5] ^= t[ 5] & m; - r[ 6] ^= t[ 6] & m; - r[ 7] ^= t[ 7] & m; - r[ 8] ^= t[ 8] & m; - r[ 9] ^= t[ 9] & m; -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Double the Montgomery form projective point p a number of times. - * - * r Result of repeated doubling of point. - * p Point to double. - * n Number of times to double - * t Temporary ordinate data. 
- */ -static void sp_256_proj_point_dbl_n_10(sp_point_256* p, int n, - sp_digit* t) -{ - sp_digit* w = t; - sp_digit* a = t + 2*10; - sp_digit* b = t + 4*10; - sp_digit* t1 = t + 6*10; - sp_digit* t2 = t + 8*10; - sp_digit* x; - sp_digit* y; - sp_digit* z; - - x = p->x; - y = p->y; - z = p->z; - - /* Y = 2*Y */ - sp_256_mont_dbl_10(y, y, p256_mod); - /* W = Z^4 */ - sp_256_mont_sqr_10(w, z, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(w, w, p256_mod, p256_mp_mod); - -#ifndef WOLFSSL_SP_SMALL - while (--n > 0) -#else - while (--n >= 0) -#endif - { - /* A = 3*(X^2 - W) */ - sp_256_mont_sqr_10(t1, x, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(t1, t1, w, p256_mod); - sp_256_mont_tpl_10(a, t1, p256_mod); - /* B = X*Y^2 */ - sp_256_mont_sqr_10(t1, y, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(b, t1, x, p256_mod, p256_mp_mod); - /* X = A^2 - 2B */ - sp_256_mont_sqr_10(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(t2, b, p256_mod); - sp_256_mont_sub_10(x, x, t2, p256_mod); - /* Z = Z*Y */ - sp_256_mont_mul_10(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ - sp_256_mont_sqr_10(t1, t1, p256_mod, p256_mp_mod); -#ifdef WOLFSSL_SP_SMALL - if (n != 0) -#endif - { - /* W = W*Y^4 */ - sp_256_mont_mul_10(w, w, t1, p256_mod, p256_mp_mod); - } - /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_10(y, b, x, p256_mod); - sp_256_mont_mul_10(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(y, y, p256_mod); - sp_256_mont_sub_10(y, y, t1, p256_mod); - } -#ifndef WOLFSSL_SP_SMALL - /* A = 3*(X^2 - W) */ - sp_256_mont_sqr_10(t1, x, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(t1, t1, w, p256_mod); - sp_256_mont_tpl_10(a, t1, p256_mod); - /* B = X*Y^2 */ - sp_256_mont_sqr_10(t1, y, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(b, t1, x, p256_mod, p256_mp_mod); - /* X = A^2 - 2B */ - sp_256_mont_sqr_10(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(t2, b, p256_mod); - sp_256_mont_sub_10(x, x, t2, p256_mod); - /* Z = Z*Y */ - sp_256_mont_mul_10(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ - sp_256_mont_sqr_10(t1, t1, p256_mod, p256_mp_mod); - /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_10(y, b, x, p256_mod); - sp_256_mont_mul_10(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(y, y, p256_mod); - sp_256_mont_sub_10(y, y, t1, p256_mod); -#endif - /* Y = Y/2 */ - sp_256_div2_10(y, y, p256_mod); -} - -/* Double the Montgomery form projective point p a number of times. - * - * r Result of repeated doubling of point. - * p Point to double. - * n Number of times to double - * t Temporary ordinate data. 
- */ -static void sp_256_proj_point_dbl_n_store_10(sp_point_256* r, - const sp_point_256* p, int n, int m, sp_digit* t) -{ - sp_digit* w = t; - sp_digit* a = t + 2*10; - sp_digit* b = t + 4*10; - sp_digit* t1 = t + 6*10; - sp_digit* t2 = t + 8*10; - sp_digit* x = r[2*m].x; - sp_digit* y = r[(1<<n) + 2*m].y; - sp_digit* z = r[2*m].z; - int i; - int j; - - for (i=0; i<10; i++) { - x[i] = p->x[i]; - } - for (i=0; i<10; i++) { - y[i] = p->y[i]; - } - for (i=0; i<10; i++) { - z[i] = p->z[i]; - } - - /* Y = 2*Y */ - sp_256_mont_dbl_10(y, y, p256_mod); - /* W = Z^4 */ - sp_256_mont_sqr_10(w, z, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(w, w, p256_mod, p256_mp_mod); - j = m; - for (i=1; i<=n; i++) { - j *= 2; - - /* A = 3*(X^2 - W) */ - sp_256_mont_sqr_10(t1, x, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(t1, t1, w, p256_mod); - sp_256_mont_tpl_10(a, t1, p256_mod); - /* B = X*Y^2 */ - sp_256_mont_sqr_10(t2, y, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(b, t2, x, p256_mod, p256_mp_mod); - x = r[j].x; - /* X = A^2 - 2B */ - sp_256_mont_sqr_10(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(t1, b, p256_mod); - sp_256_mont_sub_10(x, x, t1, p256_mod); - /* Z = Z*Y */ - sp_256_mont_mul_10(r[j].z, z, y, p256_mod, p256_mp_mod); - z = r[j].z; - /* t2 = Y^4 */ - sp_256_mont_sqr_10(t2, t2, p256_mod, p256_mp_mod); - if (i != n) { - /* W = W*Y^4 */ - sp_256_mont_mul_10(w, w, t2, p256_mod, p256_mp_mod); - } - /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_10(y, b, x, p256_mod); - sp_256_mont_mul_10(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_10(y, y, p256_mod); - sp_256_mont_sub_10(y, y, t2, p256_mod); - - /* Y = Y/2 */ - sp_256_div2_10(r[j].y, y, p256_mod); - r[j].infinity = 0; - } -} - -/* Add two Montgomery form projective points. - * - * ra Result of addition. - * rs Result of subtraction. - * p First point to add. - * q Second point to add. - * t Temporary ordinate data. 
- */ -static void sp_256_proj_point_add_sub_10(sp_point_256* ra, - sp_point_256* rs, const sp_point_256* p, const sp_point_256* q, - sp_digit* t) -{ - sp_digit* t1 = t; - sp_digit* t2 = t + 2*10; - sp_digit* t3 = t + 4*10; - sp_digit* t4 = t + 6*10; - sp_digit* t5 = t + 8*10; - sp_digit* t6 = t + 10*10; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; - sp_digit* xs = rs->x; - sp_digit* ys = rs->y; - sp_digit* zs = rs->z; - - - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); - ra->infinity = 0; - rs->infinity = 0; - - /* U1 = X1*Z2^2 */ - sp_256_mont_sqr_10(t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t1, t1, x, p256_mod, p256_mp_mod); - /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_10(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t4, t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t2, t2, q->x, p256_mod, p256_mp_mod); - /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_10(t3, t3, y, p256_mod, p256_mp_mod); - /* S2 = Y2*Z1^3 */ - sp_256_mont_mul_10(t4, t4, q->y, p256_mod, p256_mp_mod); - /* H = U2 - U1 */ - sp_256_mont_sub_10(t2, t2, t1, p256_mod); - /* RS = S2 + S1 */ - sp_256_mont_add_10(t6, t4, t3, p256_mod); - /* R = S2 - S1 */ - sp_256_mont_sub_10(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_10(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_10(x, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(xs, t6, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(y, t1, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(x, x, t5, p256_mod); - sp_256_mont_sub_10(xs, xs, t5, p256_mod); - sp_256_mont_dbl_10(t1, y, p256_mod); - sp_256_mont_sub_10(x, x, t1, p256_mod); - sp_256_mont_sub_10(xs, xs, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_10(ys, y, xs, p256_mod); - sp_256_mont_sub_10(y, y, x, p256_mod); - sp_256_mont_mul_10(y, y, t4, p256_mod, p256_mp_mod); - sp_256_sub_10(t6, p256_mod, t6); - sp_256_mont_mul_10(ys, ys, t6, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(y, y, t5, p256_mod); - sp_256_mont_sub_10(ys, ys, t5, p256_mod); -} - -/* Structure used to describe recoding of scalar multiplication. */ -typedef struct ecc_recode_256 { - /* Index into pre-computation table. */ - uint8_t i; - /* Use the negative of the point. */ - uint8_t neg; -} ecc_recode_256; - -/* The index into pre-computation table to use. */ -static const uint8_t recode_index_10_6[66] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - 0, 1, -}; - -/* Whether to negate y-ordinate. */ -static const uint8_t recode_neg_10_6[66] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, -}; - -/* Recode the scalar for multiplication using pre-computed values and - * subtraction. - * - * k Scalar to multiply by. 
- * v Vector of operations to perform. - */ -static void sp_256_ecc_recode_6_10(const sp_digit* k, ecc_recode_256* v) -{ - int i; - int j; - uint8_t y; - int carry = 0; - int o; - sp_digit n; - - j = 0; - n = k[j]; - o = 0; - for (i=0; i<43; i++) { - y = (int8_t)n; - if (o + 6 < 26) { - y &= 0x3f; - n >>= 6; - o += 6; - } - else if (o + 6 == 26) { - n >>= 6; - if (++j < 10) - n = k[j]; - o = 0; - } - else if (++j < 10) { - n = k[j]; - y |= (uint8_t)((n << (26 - o)) & 0x3f); - o -= 20; - n >>= o; - } - - y += (uint8_t)carry; - v[i].i = recode_index_10_6[y]; - v[i].neg = recode_neg_10_6[y]; - carry = (y >> 6) + v[i].neg; - } -} - -#ifndef WC_NO_CACHE_RESISTANT -/* Touch each possible point that could be being copied. - * - * r Point to copy into. - * table Table - start of the entires to access - * idx Index of entry to retrieve. - */ -static void sp_256_get_point_33_10(sp_point_256* r, const sp_point_256* table, - int idx) -{ - int i; - sp_digit mask; - - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->x[6] = 0; - r->x[7] = 0; - r->x[8] = 0; - r->x[9] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->y[6] = 0; - r->y[7] = 0; - r->y[8] = 0; - r->y[9] = 0; - r->z[0] = 0; - r->z[1] = 0; - r->z[2] = 0; - r->z[3] = 0; - r->z[4] = 0; - r->z[5] = 0; - r->z[6] = 0; - r->z[7] = 0; - r->z[8] = 0; - r->z[9] = 0; - for (i = 1; i < 33; i++) { - mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->x[6] |= mask & table[i].x[6]; - r->x[7] |= mask & table[i].x[7]; - r->x[8] |= mask & table[i].x[8]; - r->x[9] |= mask & table[i].x[9]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->y[6] |= mask & table[i].y[6]; - r->y[7] |= mask & table[i].y[7]; - r->y[8] |= mask & table[i].y[8]; - r->y[9] |= mask & table[i].y[9]; - r->z[0] |= mask & table[i].z[0]; - r->z[1] |= mask & table[i].z[1]; - r->z[2] |= mask & table[i].z[2]; - r->z[3] |= mask & table[i].z[3]; - r->z[4] |= mask & table[i].z[4]; - r->z[5] |= mask & table[i].z[5]; - r->z[6] |= mask & table[i].z[6]; - r->z[7] |= mask & table[i].z[7]; - r->z[8] |= mask & table[i].z[8]; - r->z[9] |= mask & table[i].z[9]; - } -} -#endif /* !WC_NO_CACHE_RESISTANT */ -/* Multiply the point by the scalar and return the result. - * If map is true then convert result to affine coordinates. - * - * Window technique of 6 bits. (Add-Sub variation.) - * Calculate 0..32 times the point. Use function that adds and - * subtracts the same two points. - * Recode to add or subtract one of the computed points. - * Double to push up. - * NOT a sliding window. - * - * r Resulting point. - * g Point to multiply. - * k Scalar to multiply by. - * map Indicates whether to convert result to affine. - * ct Constant time required. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. 
- */ -static int sp_256_ecc_mulmod_win_add_sub_10(sp_point_256* r, const sp_point_256* g, - const sp_digit* k, int map, int ct, void* heap) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* t = NULL; - sp_digit* tmp = NULL; -#else - sp_point_256 t[33+2]; - sp_digit tmp[2 * 10 * 6]; -#endif - sp_point_256* rt = NULL; - sp_point_256* p = NULL; - sp_digit* negy; - int i; - ecc_recode_256 v[43]; - int err = MP_OKAY; - - /* Constant time used for cache attack resistance implementation. */ - (void)ct; - (void)heap; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * - (33+2), heap, DYNAMIC_TYPE_ECC); - if (t == NULL) - err = MEMORY_E; - if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 6, - heap, DYNAMIC_TYPE_ECC); - if (tmp == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - rt = t + 33; - p = t + 33+1; - - /* t[0] = {0, 0, 1} * norm */ - XMEMSET(&t[0], 0, sizeof(t[0])); - t[0].infinity = 1; - /* t[1] = {g->x, g->y, g->z} * norm */ - err = sp_256_mod_mul_norm_10(t[1].x, g->x, p256_mod); - } - if (err == MP_OKAY) { - err = sp_256_mod_mul_norm_10(t[1].y, g->y, p256_mod); - } - if (err == MP_OKAY) { - err = sp_256_mod_mul_norm_10(t[1].z, g->z, p256_mod); - } - - if (err == MP_OKAY) { - t[1].infinity = 0; - /* t[2] ... t[32] */ - sp_256_proj_point_dbl_n_store_10(t, &t[ 1], 5, 1, tmp); - sp_256_proj_point_add_10(&t[ 3], &t[ 2], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[ 6], &t[ 3], tmp); - sp_256_proj_point_add_sub_10(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[10], &t[ 5], tmp); - sp_256_proj_point_add_sub_10(&t[11], &t[ 9], &t[10], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[12], &t[ 6], tmp); - sp_256_proj_point_dbl_10(&t[14], &t[ 7], tmp); - sp_256_proj_point_add_sub_10(&t[15], &t[13], &t[14], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[18], &t[ 9], tmp); - sp_256_proj_point_add_sub_10(&t[19], &t[17], &t[18], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[20], &t[10], tmp); - sp_256_proj_point_dbl_10(&t[22], &t[11], tmp); - sp_256_proj_point_add_sub_10(&t[23], &t[21], &t[22], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[24], &t[12], tmp); - sp_256_proj_point_dbl_10(&t[26], &t[13], tmp); - sp_256_proj_point_add_sub_10(&t[27], &t[25], &t[26], &t[ 1], tmp); - sp_256_proj_point_dbl_10(&t[28], &t[14], tmp); - sp_256_proj_point_dbl_10(&t[30], &t[15], tmp); - sp_256_proj_point_add_sub_10(&t[31], &t[29], &t[30], &t[ 1], tmp); - - negy = t[0].y; - - sp_256_ecc_recode_6_10(k, v); - - i = 42; - #ifndef WC_NO_CACHE_RESISTANT - if (ct) { - sp_256_get_point_33_10(rt, t, v[i].i); - rt->infinity = !v[i].i; - } - else - #endif - { - XMEMCPY(rt, &t[v[i].i], sizeof(sp_point_256)); - } - for (--i; i>=0; i--) { - sp_256_proj_point_dbl_n_10(rt, 6, tmp); - - #ifndef WC_NO_CACHE_RESISTANT - if (ct) { - sp_256_get_point_33_10(p, t, v[i].i); - p->infinity = !v[i].i; - } - else - #endif - { - XMEMCPY(p, &t[v[i].i], sizeof(sp_point_256)); - } - sp_256_sub_10(negy, p256_mod, p->y); - sp_256_norm_10(negy); - sp_256_cond_copy_10(p->y, negy, (sp_digit)0 - v[i].neg); - sp_256_proj_point_add_10(rt, rt, p, tmp); - } - - if (map != 0) { - sp_256_map_10(r, rt, tmp); - } - else { - XMEMCPY(r, rt, sizeof(sp_point_256)); - } - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (t != NULL) - XFREE(t, heap, DYNAMIC_TYPE_ECC); - if (tmp != NULL) - XFREE(tmp, heap, DYNAMIC_TYPE_ECC); -#endif - - return err; -} - -#ifdef FP_ECC -#endif /* FP_ECC 
*/ -/* Add two Montgomery form projective points. The second point has a q value of - * one. - * Only the first point can be the same pointer as the result point. - * - * r Result of addition. - * p First point to add. - * q Second point to add. - * t Temporary ordinate data. - */ -static void sp_256_proj_point_add_qz1_10(sp_point_256* r, const sp_point_256* p, - const sp_point_256* q, sp_digit* t) -{ - const sp_point_256* ap[2]; - sp_point_256* rp[2]; - sp_digit* t1 = t; - sp_digit* t2 = t + 2*10; - sp_digit* t3 = t + 4*10; - sp_digit* t4 = t + 6*10; - sp_digit* t5 = t + 8*10; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; - - /* Check double */ - (void)sp_256_sub_10(t1, p256_mod, q->y); - sp_256_norm_10(t1); - if ((sp_256_cmp_equal_10(p->x, q->x) & sp_256_cmp_equal_10(p->z, q->z) & - (sp_256_cmp_equal_10(p->y, q->y) | sp_256_cmp_equal_10(p->y, t1))) != 0) { - sp_256_proj_point_dbl_10(r, p, t); - } - else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<10; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<10; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<10; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; - - /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_10(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t4, t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t2, t2, q->x, p256_mod, p256_mp_mod); - /* S2 = Y2*Z1^3 */ - sp_256_mont_mul_10(t4, t4, q->y, p256_mod, p256_mp_mod); - /* H = U2 - X1 */ - sp_256_mont_sub_10(t2, t2, x, p256_mod); - /* R = S2 - Y1 */ - sp_256_mont_sub_10(t4, t4, y, p256_mod); - /* Z3 = H*Z1 */ - sp_256_mont_mul_10(z, z, t2, p256_mod, p256_mp_mod); - /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_10(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_10(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t3, x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(x, t1, t5, p256_mod); - sp_256_mont_dbl_10(t1, t3, p256_mod); - sp_256_mont_sub_10(x, x, t1, p256_mod); - /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_10(t3, t3, x, p256_mod); - sp_256_mont_mul_10(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t5, t5, y, p256_mod, p256_mp_mod); - sp_256_mont_sub_10(y, t3, t5, p256_mod); - } -} - -#ifdef FP_ECC -/* Convert the projective point to affine. - * Ordinates are in Montgomery form. - * - * a Point to convert. - * t Temporary data. - */ -static void sp_256_proj_to_affine_10(sp_point_256* a, sp_digit* t) -{ - sp_digit* t1 = t; - sp_digit* t2 = t + 2 * 10; - sp_digit* tmp = t + 4 * 10; - - sp_256_mont_inv_10(t1, a->z, tmp); - - sp_256_mont_sqr_10(t2, t1, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(t1, t2, t1, p256_mod, p256_mp_mod); - - sp_256_mont_mul_10(a->x, a->x, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(a->y, a->y, t1, p256_mod, p256_mp_mod); - XMEMCPY(a->z, p256_norm_mod, sizeof(p256_norm_mod)); -} - -/* Generate the pre-computed table of points for the base point. - * - * width = 8 - * 256 entries - * 32 bits between - * - * a The base point. - * table Place to store generated point data. - * tmp Temporary data. - * heap Heap to use for allocation. 
- */
-static int sp_256_gen_stripe_table_10(const sp_point_256* a,
-        sp_table_entry_256* table, sp_digit* tmp, void* heap)
-{
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    sp_point_256* t = NULL;
-#else
-    sp_point_256 t[3];
-#endif
-    sp_point_256* s1 = NULL;
-    sp_point_256* s2 = NULL;
-    int i;
-    int j;
-    int err = MP_OKAY;
-
-    (void)heap;
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap,
-        DYNAMIC_TYPE_ECC);
-    if (t == NULL)
-        err = MEMORY_E;
-#endif
-
-    if (err == MP_OKAY) {
-        s1 = t + 1;
-        s2 = t + 2;
-
-        err = sp_256_mod_mul_norm_10(t->x, a->x, p256_mod);
-    }
-    if (err == MP_OKAY) {
-        err = sp_256_mod_mul_norm_10(t->y, a->y, p256_mod);
-    }
-    if (err == MP_OKAY) {
-        err = sp_256_mod_mul_norm_10(t->z, a->z, p256_mod);
-    }
-    if (err == MP_OKAY) {
-        t->infinity = 0;
-        sp_256_proj_to_affine_10(t, tmp);
-
-        XMEMCPY(s1->z, p256_norm_mod, sizeof(p256_norm_mod));
-        s1->infinity = 0;
-        XMEMCPY(s2->z, p256_norm_mod, sizeof(p256_norm_mod));
-        s2->infinity = 0;
-
-        /* table[0] = {0, 0, infinity} */
-        XMEMSET(&table[0], 0, sizeof(sp_table_entry_256));
-        /* table[1] = Affine version of 'a' in Montgomery form */
-        XMEMCPY(table[1].x, t->x, sizeof(table->x));
-        XMEMCPY(table[1].y, t->y, sizeof(table->y));
-
-        for (i=1; i<8; i++) {
-            sp_256_proj_point_dbl_n_10(t, 32, tmp);
-            sp_256_proj_to_affine_10(t, tmp);
-            XMEMCPY(table[1<<i].x, t->x, sizeof(table->x));
-            XMEMCPY(table[1<<i].y, t->y, sizeof(table->y));
-        }
-
-        for (i=1; i<8; i++) {
-            XMEMCPY(s1->x, table[1<<i].x, sizeof(table->x));
-            XMEMCPY(s1->y, table[1<<i].y, sizeof(table->y));
-            for (j=(1<<i)+1; j<(1<<(i+1)); j++) {
-                XMEMCPY(s2->x, table[j-(1<<i)].x, sizeof(table->x));
-                XMEMCPY(s2->y, table[j-(1<<i)].y, sizeof(table->y));
-                sp_256_proj_point_add_qz1_10(t, s1, s2, tmp);
-                sp_256_proj_to_affine_10(t, tmp);
-                XMEMCPY(table[j].x, t->x, sizeof(table->x));
-                XMEMCPY(table[j].y, t->y, sizeof(table->y));
-            }
-        }
-    }
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    if (t != NULL)
-        XFREE(t, heap, DYNAMIC_TYPE_ECC);
-#endif
-
-    return err;
-}
-
-#endif /* FP_ECC */
-#ifndef WC_NO_CACHE_RESISTANT
-/* Touch each possible entry that could be being copied.
- *
- * r Point to copy into.
- * table Table - start of the entires to access
- * idx Index of entry to retrieve.
- */
-static void sp_256_get_entry_256_10(sp_point_256* r,
-    const sp_table_entry_256* table, int idx)
-{
-    int i;
-    sp_digit mask;
-
-    r->x[0] = 0;
-    r->x[1] = 0;
-    r->x[2] = 0;
-    r->x[3] = 0;
-    r->x[4] = 0;
-    r->x[5] = 0;
-    r->x[6] = 0;
-    r->x[7] = 0;
-    r->x[8] = 0;
-    r->x[9] = 0;
-    r->y[0] = 0;
-    r->y[1] = 0;
-    r->y[2] = 0;
-    r->y[3] = 0;
-    r->y[4] = 0;
-    r->y[5] = 0;
-    r->y[6] = 0;
-    r->y[7] = 0;
-    r->y[8] = 0;
-    r->y[9] = 0;
-    for (i = 1; i < 256; i++) {
-        mask = 0 - (i == idx);
-        r->x[0] |= mask & table[i].x[0];
-        r->x[1] |= mask & table[i].x[1];
-        r->x[2] |= mask & table[i].x[2];
-        r->x[3] |= mask & table[i].x[3];
-        r->x[4] |= mask & table[i].x[4];
-        r->x[5] |= mask & table[i].x[5];
-        r->x[6] |= mask & table[i].x[6];
-        r->x[7] |= mask & table[i].x[7];
-        r->x[8] |= mask & table[i].x[8];
-        r->x[9] |= mask & table[i].x[9];
-        r->y[0] |= mask & table[i].y[0];
-        r->y[1] |= mask & table[i].y[1];
-        r->y[2] |= mask & table[i].y[2];
-        r->y[3] |= mask & table[i].y[3];
-        r->y[4] |= mask & table[i].y[4];
-        r->y[5] |= mask & table[i].y[5];
-        r->y[6] |= mask & table[i].y[6];
-        r->y[7] |= mask & table[i].y[7];
-        r->y[8] |= mask & table[i].y[8];
-        r->y[9] |= mask & table[i].y[9];
-    }
-}
-#endif /* !WC_NO_CACHE_RESISTANT */
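The lookup just removed is the cache-attack countermeasure for the stripe method: every one of the 256 table entries is read and combined under a mask, so the memory access pattern is independent of the secret index. A generic sketch of the same masking idiom, assuming 32-bit words (ct_select is a hypothetical helper, not a wolfSSL API):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy table entry idx into out without an index-dependent access
     * pattern: read every entry, keep only the masked one. */
    static void ct_select(uint32_t* out, const uint32_t* table,
                          size_t entries, size_t words, size_t idx)
    {
        size_t i;
        size_t j;

        for (j = 0; j < words; j++)
            out[j] = 0;
        for (i = 0; i < entries; i++) {
            /* All-ones only when i == idx. */
            uint32_t mask = (uint32_t)0 - (uint32_t)(i == idx);
            for (j = 0; j < words; j++)
                out[j] |= mask & table[i * words + j];
        }
    }

sp_256_get_entry_256_10 can start its loop at 1 because entry 0 is the point at infinity and the output has already been zeroed.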
-/* Multiply the point by the scalar and return the result.
- * If map is true then convert result to affine coordinates.
- *
- * Stripe implementation.
- * Pre-generated: 2^0, 2^32, ...
- * Pre-generated: products of all combinations of above.
- * 8 doubles and adds (with qz=1)
- *
- * r Resulting point.
- * k Scalar to multiply by.
- * table Pre-computed table.
- * map Indicates whether to convert result to affine.
- * ct Constant time required.
- * heap Heap to use for allocation.
- * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
- */
-static int sp_256_ecc_mulmod_stripe_10(sp_point_256* r, const sp_point_256* g,
-        const sp_table_entry_256* table, const sp_digit* k, int map,
-        int ct, void* heap)
-{
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    sp_point_256* rt = NULL;
-    sp_digit* t = NULL;
-#else
-    sp_point_256 rt[2];
-    sp_digit t[2 * 10 * 5];
-#endif
-    sp_point_256* p = NULL;
-    int i;
-    int j;
-    int y;
-    int x;
-    int err = MP_OKAY;
-
-    (void)g;
-    /* Constant time used for cache attack resistance implementation. */
-    (void)ct;
-    (void)heap;
-
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    rt = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
-        DYNAMIC_TYPE_ECC);
-    if (rt == NULL)
-        err = MEMORY_E;
-    if (err == MP_OKAY) {
-        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 5, heap,
-            DYNAMIC_TYPE_ECC);
-        if (t == NULL)
-            err = MEMORY_E;
-    }
-#endif
-
-    if (err == MP_OKAY) {
-        p = rt + 1;
-
-        XMEMCPY(p->z, p256_norm_mod, sizeof(p256_norm_mod));
-        XMEMCPY(rt->z, p256_norm_mod, sizeof(p256_norm_mod));
-
-        y = 0;
-        x = 31;
-        for (j=0; j<8; j++) {
-            y |= (int)(((k[x / 26] >> (x % 26)) & 1) << j);
-            x += 32;
-        }
-    #ifndef WC_NO_CACHE_RESISTANT
-        if (ct) {
-            sp_256_get_entry_256_10(rt, table, y);
-        } else
-    #endif
-        {
-            XMEMCPY(rt->x, table[y].x, sizeof(table[y].x));
-            XMEMCPY(rt->y, table[y].y, sizeof(table[y].y));
-        }
-        rt->infinity = !y;
-        for (i=30; i>=0; i--) {
-            y = 0;
-            x = i;
-            for (j=0; j<8; j++) {
-                y |= (int)(((k[x / 26] >> (x % 26)) & 1) << j);
-                x += 32;
-            }
-
-            sp_256_proj_point_dbl_10(rt, rt, t);
-        #ifndef WC_NO_CACHE_RESISTANT
-            if (ct) {
-                sp_256_get_entry_256_10(p, table, y);
-            }
-            else
-        #endif
-            {
-                XMEMCPY(p->x, table[y].x, sizeof(table[y].x));
-                XMEMCPY(p->y, table[y].y, sizeof(table[y].y));
-            }
-            p->infinity = !y;
-            sp_256_proj_point_add_qz1_10(rt, rt, p, t);
-        }
-
-        if (map != 0) {
-            sp_256_map_10(r, rt, t);
-        }
-        else {
-            XMEMCPY(r, rt, sizeof(sp_point_256));
-        }
-    }
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    if (t != NULL)
-        XFREE(t, heap, DYNAMIC_TYPE_ECC);
-    if (rt != NULL)
-        XFREE(rt, heap, DYNAMIC_TYPE_ECC);
-#endif
-
-    return err;
-}
-
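The index y assembled in the loops above picks one of the 256 stripe-table entries from eight scalar bits spaced 32 bits apart (bits x, x+32, ..., x+224). Pulled out on its own, the computation looks like the sketch below, which assumes the scalar is stored in the 26-bit digits of this removed 10-word implementation; under the new word layouts introduced by this patch only the / 26 and % 26 constants would differ (stripe_index and sp_digit26 are illustrative names).

    #include <stdint.h>

    typedef uint32_t sp_digit26;   /* stand-in for a 26-bit sp_digit */

    /* Gather bits x, x+32, ..., x+224 of the scalar into an 8-bit index. */
    static int stripe_index(const sp_digit26* k, int x)
    {
        int j;
        int y = 0;

        for (j = 0; j < 8; j++) {
            y |= (int)(((k[x / 26] >> (x % 26)) & 1) << j);
            x += 32;
        }
        return y;
    }

Starting from x = 31 and stepping x down to 0 with one doubling between steps lets the precomputed multiples 2^0*G, 2^32*G, ..., 2^224*G cover the whole scalar with 31 doublings and 31 additions.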
-#ifdef FP_ECC
-#ifndef FP_ENTRIES
-    #define FP_ENTRIES 16
-#endif
-
-/* Cache entry - holds precomputation tables for a point. */
-typedef struct sp_cache_256_t {
-    /* X ordinate of point that table was generated from. */
-    sp_digit x[10];
-    /* Y ordinate of point that table was generated from. */
-    sp_digit y[10];
-    /* Precomputation table for point. */
-    sp_table_entry_256 table[256];
-    /* Count of entries in table. */
-    uint32_t cnt;
-    /* Point and table set in entry. */
-    int set;
-} sp_cache_256_t;
-
-/* Cache of tables. */
-static THREAD_LS_T sp_cache_256_t sp_cache_256[FP_ENTRIES];
-/* Index of last entry in cache. */
-static THREAD_LS_T int sp_cache_256_last = -1;
-/* Cache has been initialized. */
-static THREAD_LS_T int sp_cache_256_inited = 0;
-
-#ifndef HAVE_THREAD_LS
-    static volatile int initCacheMutex_256 = 0;
-    static wolfSSL_Mutex sp_cache_256_lock;
-#endif
-
-/* Get the cache entry for the point.
- *
- * g [in] Point scalar multipling.
- * cache [out] Cache table to use.
- */
-static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
-{
-    int i;
-    int j;
-    uint32_t least;
-
-    if (sp_cache_256_inited == 0) {
-        for (i=0; i<FP_ENTRIES; i++) {
-            sp_cache_256[i].set = 0;
-        }
-        sp_cache_256_inited = 1;
-    }
-
-    /* Compare point with those in cache. */
-    for (i=0; i<FP_ENTRIES; i++) {
-        if (!sp_cache_256[i].set)
-            continue;
-
-        if (sp_256_cmp_equal_10(g->x, sp_cache_256[i].x) &
-                sp_256_cmp_equal_10(g->y, sp_cache_256[i].y)) {
-            sp_cache_256[i].cnt++;
-            break;
-        }
-    }
-
-    /* No match. */
-    if (i == FP_ENTRIES) {
-        /* Find empty entry. */
-        i = (sp_cache_256_last + 1) % FP_ENTRIES;
-        for (; i != sp_cache_256_last; i=(i+1)%FP_ENTRIES) {
-            if (!sp_cache_256[i].set) {
-                break;
-            }
-        }
-
-        /* Evict least used. */
-        if (i == sp_cache_256_last) {
-            least = sp_cache_256[0].cnt;
-            for (j=1; j<FP_ENTRIES; j++) {
-                if (sp_cache_256[j].cnt < least) {
-                    i = j;
-                    least = sp_cache_256[i].cnt;
-                }
-            }
-        }
-
-        XMEMCPY(sp_cache_256[i].x, g->x, sizeof(sp_cache_256[i].x));
-        XMEMCPY(sp_cache_256[i].y, g->y, sizeof(sp_cache_256[i].y));
-        sp_cache_256[i].set = 1;
-        sp_cache_256[i].cnt = 1;
-    }
-
-    *cache = &sp_cache_256[i];
-    sp_cache_256_last = i;
-}
-#endif /* FP_ECC */
-
-/* Multiply the base point of P256 by the scalar and return the result.
- * If map is true then convert result to affine coordinates.
- *
- * r Resulting point.
- * g Point to multiply.
- * k Scalar to multiply by.
- * map Indicates whether to convert result to affine.
- * ct Constant time required.
- * heap Heap to use for allocation.
- * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
- */
-static int sp_256_ecc_mulmod_10(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
-        int map, int ct, void* heap)
-{
-#ifndef FP_ECC
-    return sp_256_ecc_mulmod_win_add_sub_10(r, g, k, map, ct, heap);
-#else
-    sp_digit tmp[2 * 10 * 5];
-    sp_cache_256_t* cache;
-    int err = MP_OKAY;
-
-#ifndef HAVE_THREAD_LS
-    if (initCacheMutex_256 == 0) {
-        wc_InitMutex(&sp_cache_256_lock);
-        initCacheMutex_256 = 1;
-    }
-    if (wc_LockMutex(&sp_cache_256_lock) != 0)
-        err = BAD_MUTEX_E;
-#endif /* HAVE_THREAD_LS */
-
-    if (err == MP_OKAY) {
-        sp_ecc_get_cache_256(g, &cache);
-        if (cache->cnt == 2)
-            sp_256_gen_stripe_table_10(g, cache->table, tmp, heap);
-
-#ifndef HAVE_THREAD_LS
-        wc_UnLockMutex(&sp_cache_256_lock);
-#endif /* HAVE_THREAD_LS */
-
-        if (cache->cnt < 2) {
-            err = sp_256_ecc_mulmod_win_add_sub_10(r, g, k, map, ct, heap);
-        }
-        else {
-            err = sp_256_ecc_mulmod_stripe_10(r, g, cache->table, k,
-                    map, ct, heap);
-        }
-    }
-
-    return err;
-#endif
-}
-
-#endif
-/* Multiply the point by the scalar and return the result.
- * If map is true then convert result to affine coordinates.
- *
- * km Scalar to multiply by.
- * p Point to multiply.
- * r Resulting point.
- * map Indicates whether to convert result to affine.
- * heap Heap to use for allocation.
- * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
- */
-int sp_ecc_mulmod_256(const mp_int* km, const ecc_point* gm, ecc_point* r,
-        int map, void* heap)
-{
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    sp_point_256* point = NULL;
-    sp_digit* k = NULL;
-#else
-    sp_point_256 point[1];
-    sp_digit k[10];
-#endif
-    int err = MP_OKAY;
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
-        DYNAMIC_TYPE_ECC);
-    if (point == NULL)
-        err = MEMORY_E;
-    if (err == MP_OKAY) {
-        k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10, heap,
-            DYNAMIC_TYPE_ECC);
-        if (k == NULL)
-            err = MEMORY_E;
-    }
-#endif
-
-    if (err == MP_OKAY) {
-        sp_256_from_mp(k, 10, km);
-        sp_256_point_from_ecc_point_10(point, gm);
-
-        err = sp_256_ecc_mulmod_10(point, point, k, map, 1, heap);
-    }
-    if (err == MP_OKAY) {
-        err = sp_256_point_to_ecc_point_10(point, r);
-    }
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    if (k != NULL)
-        XFREE(k, heap, DYNAMIC_TYPE_ECC);
-    if (point != NULL)
-        XFREE(point, heap, DYNAMIC_TYPE_ECC);
-#endif
-
-    return err;
-}
-
-/* Multiply the point by the scalar, add point a and return the result.
- * If map is true then convert result to affine coordinates.
- *
- * km Scalar to multiply by.
- * p Point to multiply.
- * am Point to add to scalar mulitply result.
- * inMont Point to add is in montogmery form.
- * r Resulting point.
- * map Indicates whether to convert result to affine.
- * heap Heap to use for allocation.
- * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
- */
-int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm,
-    const ecc_point* am, int inMont, ecc_point* r, int map, void* heap)
-{
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    sp_point_256* point = NULL;
-    sp_digit* k = NULL;
-#else
-    sp_point_256 point[2];
-    sp_digit k[10 + 10 * 2 * 5];
-#endif
-    sp_point_256* addP = NULL;
-    sp_digit* tmp = NULL;
-    int err = MP_OKAY;
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
-        DYNAMIC_TYPE_ECC);
-    if (point == NULL)
-        err = MEMORY_E;
-    if (err == MP_OKAY) {
-        k = (sp_digit*)XMALLOC(
-            sizeof(sp_digit) * (10 + 10 * 2 * 5), heap,
-            DYNAMIC_TYPE_ECC);
-        if (k == NULL)
-            err = MEMORY_E;
-    }
-#endif
-
-    if (err == MP_OKAY) {
-        addP = point + 1;
-        tmp = k + 10;
-
-        sp_256_from_mp(k, 10, km);
-        sp_256_point_from_ecc_point_10(point, gm);
-        sp_256_point_from_ecc_point_10(addP, am);
-    }
-    if ((err == MP_OKAY) && (!inMont)) {
-        err = sp_256_mod_mul_norm_10(addP->x, addP->x, p256_mod);
-    }
-    if ((err == MP_OKAY) && (!inMont)) {
-        err = sp_256_mod_mul_norm_10(addP->y, addP->y, p256_mod);
-    }
-    if ((err == MP_OKAY) && (!inMont)) {
-        err = sp_256_mod_mul_norm_10(addP->z, addP->z, p256_mod);
-    }
-    if (err == MP_OKAY) {
-        err = sp_256_ecc_mulmod_10(point, point, k, 0, 0, heap);
-    }
-    if (err == MP_OKAY) {
-        sp_256_proj_point_add_10(point, point, addP, tmp);
-
-        if (map) {
-            sp_256_map_10(point, point, tmp);
-        }
-
-        err = sp_256_point_to_ecc_point_10(point, r);
-    }
-
-#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
-    if (k != NULL)
-        XFREE(k, heap, DYNAMIC_TYPE_ECC);
-    if (point != NULL)
-        XFREE(point, heap, DYNAMIC_TYPE_ECC);
-#endif
-
-    return err;
-}
-
-#ifdef WOLFSSL_SP_SMALL
-/* Multiply the base point of P256 by the scalar and return the result.
- * If map is true then convert result to affine coordinates.
- *
- * r Resulting point.
- * k Scalar to multiply by.
- * map Indicates whether to convert result to affine. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. - */ -static int sp_256_ecc_mulmod_base_10(sp_point_256* r, const sp_digit* k, - int map, int ct, void* heap) -{ - /* No pre-computed values. */ - return sp_256_ecc_mulmod_10(r, &p256_base, k, map, ct, heap); -} - -#else -/* Striping precomputation table. - * 8 points combined into a table of 256 points. - * Distance of 32 between points. - */ -static const sp_table_entry_256 p256_table[256] = { - /* 0 */ - { { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }, - /* 1 */ - { { 0x0a9143c,0x1cc3506,0x360179e,0x3f17fb6,0x075ba95,0x1d88944, - 0x3b732b7,0x15719e7,0x376a537,0x0062417 }, - { 0x295560a,0x094d5f3,0x245cddf,0x392e867,0x18b4ab8,0x3487cc9, - 0x288688d,0x176174b,0x3182588,0x0215c7f } }, - /* 2 */ - { { 0x147519a,0x2218090,0x32f0202,0x2b09acd,0x0d0981e,0x1e17af2, - 0x14a7caa,0x163a6a7,0x10ddbdf,0x03654f1 }, - { 0x1590f8f,0x0d8733f,0x09179d6,0x1ad139b,0x372e962,0x0bad933, - 0x1961102,0x223cdff,0x37e9eb2,0x0218fae } }, - /* 3 */ - { { 0x0db6485,0x1ad88d7,0x2f97785,0x288bc28,0x3808f0e,0x3df8c02, - 0x28d9544,0x20280f9,0x055b5ff,0x00001d8 }, - { 0x38d2010,0x13ae6e0,0x308a763,0x2ecc90d,0x254014f,0x10a9981, - 0x247d398,0x0fb8383,0x3613437,0x020c21d } }, - /* 4 */ - { { 0x2a0d2bb,0x08bf145,0x34994f9,0x1b06988,0x30d5cc1,0x1f18b22, - 0x01cf3a5,0x199fe49,0x161fd1b,0x00bd79a }, - { 0x1a01797,0x171c2fd,0x21925c1,0x1358255,0x23d20b4,0x1c7f6d4, - 0x111b370,0x03dec12,0x1168d6f,0x03d923e } }, - /* 5 */ - { { 0x137bbbc,0x19a11f8,0x0bec9e5,0x27a29a8,0x3e43446,0x275cd18, - 0x0427617,0x00056c7,0x285133d,0x016af80 }, - { 0x04c7dab,0x2a0df30,0x0c0792a,0x1310c98,0x3573d9f,0x239b30d, - 0x1315627,0x1ce0c32,0x25b6b6f,0x0252edc } }, - /* 6 */ - { { 0x20f141c,0x26d23dc,0x3c74bbf,0x334b7d6,0x06199b3,0x0441171, - 0x3f61294,0x313bf70,0x3cb2f7d,0x03375ae }, - { 0x2f436fd,0x19c02fa,0x26becca,0x1b6e64c,0x26f647f,0x053c948, - 0x0fa7920,0x397d830,0x2bd4bda,0x028d86f } }, - /* 7 */ - { { 0x17c13c7,0x2895616,0x03e128a,0x17d42df,0x1c38d63,0x0f02747, - 0x039aecf,0x0a4b01c,0x209c4b5,0x02e84b2 }, - { 0x1f91dfd,0x023e916,0x07fb9e4,0x19b3ba8,0x13af43b,0x35e02ca, - 0x0eb0899,0x3bd2c7b,0x19d701f,0x014faee } }, - /* 8 */ - { { 0x0e63d34,0x1fb8c6c,0x0fab4fe,0x1caa795,0x0f46005,0x179ed69, - 0x093334d,0x120c701,0x39206d5,0x021627e }, - { 0x183553a,0x03d7319,0x09e5aa7,0x12b8959,0x2087909,0x0011194, - 0x1045071,0x0713f32,0x16d0254,0x03aec1a } }, - /* 9 */ - { { 0x01647c5,0x1b2856b,0x1799461,0x11f133d,0x0b8127d,0x1937eeb, - 0x266aa37,0x1f68f71,0x0cbd1b2,0x03aca08 }, - { 0x287e008,0x1be361a,0x38f3940,0x276488d,0x2d87dfa,0x0333b2c, - 0x2d2e428,0x368755b,0x09b55a7,0x007ca0a } }, - /* 10 */ - { { 0x389da99,0x2a8300e,0x0022abb,0x27ae0a1,0x0a6f2d7,0x207017a, - 0x047862b,0x1358c9e,0x35905e5,0x00cde92 }, - { 0x1f7794a,0x1d40348,0x3f613c6,0x2ddf5b5,0x0207005,0x133f5ba, - 0x1a37810,0x3ef5829,0x0d5f4c2,0x0035978 } }, - /* 11 */ - { { 0x1275d38,0x026efad,0x2358d9d,0x1142f82,0x14268a7,0x1cfac99, - 0x362ff49,0x288cbc1,0x24252f4,0x0308f68 }, - { 0x394520c,0x06e13c2,0x178e5da,0x18ec16f,0x1096667,0x134a7a8, - 0x0dcb869,0x33fc4e9,0x38cc790,0x006778e } }, - /* 12 */ - { { 0x2c5fe04,0x29c5b09,0x1bdb183,0x02ceee8,0x03b28de,0x132dc4b, - 0x32c586a,0x32ff5d0,0x3d491fc,0x038d372 }, - { 0x2a58403,0x2351aea,0x3a53b40,0x21a0ba5,0x39a6974,0x1aaaa2b, - 0x3901273,0x03dfe78,0x3447b4e,0x039d907 } }, - /* 13 */ - { { 
0x364ba59,0x14e5077,0x02fc7d7,0x3b02c09,0x1d33f10,0x0560616, - 0x06dfc6a,0x15efd3c,0x357052a,0x01284b7 }, - { 0x039dbd0,0x18ce3e5,0x3e1fbfa,0x352f794,0x0d3c24b,0x07c6cc5, - 0x1e4ffa2,0x3a91bf5,0x293bb5b,0x01abd6a } }, - /* 14 */ - { { 0x0c91999,0x02da644,0x0491da1,0x100a960,0x00a24b4,0x2330824, - 0x0094b4b,0x1004cf8,0x35a66a4,0x017f8d1 }, - { 0x13e7b4b,0x232af7e,0x391ab0f,0x069f08f,0x3292b50,0x3479898, - 0x2889aec,0x2a4590b,0x308ecfe,0x02d5138 } }, - /* 15 */ - { { 0x2ddfdce,0x231ba45,0x39e6647,0x19be245,0x12c3291,0x35399f8, - 0x0d6e764,0x3082d3a,0x2bda6b0,0x0382dac }, - { 0x37efb57,0x04b7cae,0x00070d3,0x379e431,0x01aac0d,0x1e6f251, - 0x0336ad6,0x0ddd3e4,0x3de25a6,0x01c7008 } }, - /* 16 */ - { { 0x3e20925,0x230912f,0x286762a,0x30e3f73,0x391c19a,0x34e1c18, - 0x16a5d5d,0x093d96a,0x3d421d3,0x0187561 }, - { 0x37173ea,0x19ce8a8,0x0b65e87,0x0214dde,0x2238480,0x16ead0f, - 0x38441e0,0x3bef843,0x2124621,0x03e847f } }, - /* 17 */ - { { 0x0b19ffd,0x247cacb,0x3c231c8,0x16ec648,0x201ba8d,0x2b172a3, - 0x103d678,0x2fb72db,0x04c1f13,0x0161bac }, - { 0x3e8ed09,0x171b949,0x2de20c3,0x0f06067,0x21e81a3,0x1b194be, - 0x0fd6c05,0x13c449e,0x0087086,0x006756b } }, - /* 18 */ - { { 0x09a4e1f,0x27d604c,0x00741e9,0x06fa49c,0x0ab7de7,0x3f4a348, - 0x25ef0be,0x158fc9a,0x33f7f9c,0x039f001 }, - { 0x2f59f76,0x3598e83,0x30501f6,0x15083f2,0x0669b3b,0x29980b5, - 0x0c1f7a7,0x0f02b02,0x0fec65b,0x0382141 } }, - /* 19 */ - { { 0x031b3ca,0x23da368,0x2d66f09,0x27b9b69,0x06d1cab,0x13c91ba, - 0x3d81fa9,0x25ad16f,0x0825b09,0x01e3c06 }, - { 0x225787f,0x3bf790e,0x2c9bb7e,0x0347732,0x28016f8,0x0d6ff0d, - 0x2a4877b,0x1d1e833,0x3b87e94,0x010e9dc } }, - /* 20 */ - { { 0x2b533d5,0x1ddcd34,0x1dc0625,0x3da86f7,0x3673b8a,0x1e7b0a4, - 0x3e7c9aa,0x19ac55d,0x251c3b2,0x02edb79 }, - { 0x25259b3,0x24c0ead,0x3480e7e,0x34f40e9,0x3d6a0af,0x2cf3f09, - 0x2c83d19,0x2e66f16,0x19a5d18,0x0182d18 } }, - /* 21 */ - { { 0x2e5aa1c,0x28e3846,0x3658bd6,0x0ad279c,0x1b8b765,0x397e1fb, - 0x130014e,0x3ff342c,0x3b2aeeb,0x02743c9 }, - { 0x2730a55,0x0918c5e,0x083aca9,0x0bf76ef,0x19c955b,0x300669c, - 0x01dfe0a,0x312341f,0x26d356e,0x0091295 } }, - /* 22 */ - { { 0x2cf1f96,0x00e52ba,0x271c6db,0x2a40930,0x19f2122,0x0b2f4ee, - 0x26ac1b8,0x3bda498,0x0873581,0x0117963 }, - { 0x38f9dbc,0x3d1e768,0x2040d3f,0x11ba222,0x3a8aaf1,0x1b82fb5, - 0x1adfb24,0x2de9251,0x21cc1e4,0x0301038 } }, - /* 23 */ - { { 0x38117b6,0x2bc001b,0x1433847,0x3fdce8d,0x3651969,0x3651d7a, - 0x2b35761,0x1bb1d20,0x097682c,0x00737d7 }, - { 0x1f04839,0x1dd6d04,0x16987db,0x3d12378,0x17dbeac,0x1c2cc86, - 0x121dd1b,0x3fcf6ca,0x1f8a92d,0x00119d5 } }, - /* 24 */ - { { 0x0e8ffcd,0x2b174af,0x1a82cc8,0x22cbf98,0x30d53c4,0x080b5b1, - 0x3161727,0x297cfdb,0x2113b83,0x0011b97 }, - { 0x0007f01,0x23fd936,0x3183e7b,0x0496bd0,0x07fb1ef,0x178680f, - 0x1c5ea63,0x0016c11,0x2c3303d,0x01b8041 } }, - /* 25 */ - { { 0x0dd73b1,0x1cd6122,0x10d948c,0x23e657b,0x3767070,0x15a8aad, - 0x385ea8c,0x33c7ce0,0x0ede901,0x0110965 }, - { 0x2d4b65b,0x2a8b244,0x0c37f8f,0x0ee5b24,0x394c234,0x3a5e347, - 0x26e4a15,0x39a3b4c,0x2514c2e,0x029e5be } }, - /* 26 */ - { { 0x23addd7,0x3ed8120,0x13b3359,0x20f959a,0x09e2a61,0x32fcf20, - 0x05b78e3,0x19ba7e2,0x1a9c697,0x0392b4b }, - { 0x2048a61,0x3dfd0a3,0x19a0357,0x233024b,0x3082d19,0x00fb63b, - 0x3a1af4c,0x1450ff0,0x046c37b,0x0317a50 } }, - /* 27 */ - { { 0x3e75f9e,0x294e30a,0x3a78476,0x3a32c48,0x36fd1a9,0x0427012, - 0x1e4df0b,0x11d1f61,0x1afdb46,0x018ca0f }, - { 0x2f2df15,0x0a33dee,0x27f4ce7,0x1542b66,0x3e592c4,0x20d2f30, - 0x3226ade,0x2a4e3ea,0x1ab1981,0x01a2f46 } }, - /* 28 */ - { { 
0x087d659,0x3ab5446,0x305ac08,0x3d2cd64,0x33374d5,0x3f9d3f8, - 0x186981c,0x37f5a5a,0x2f53c6f,0x01254a4 }, - { 0x2cec896,0x1e32786,0x04844a8,0x043b16d,0x3d964b2,0x1935829, - 0x16f7e26,0x1a0dd9a,0x30d2603,0x003b1d4 } }, - /* 29 */ - { { 0x12687bb,0x04e816b,0x21fa2da,0x1abccb8,0x3a1f83b,0x375181e, - 0x0f5ef51,0x0fc2ce4,0x3a66486,0x003d881 }, - { 0x3138233,0x1f8eec3,0x2718bd6,0x1b09caa,0x2dd66b9,0x1bb222b, - 0x1004072,0x1b73e3b,0x07208ed,0x03fc36c } }, - /* 30 */ - { { 0x095d553,0x3e84053,0x0a8a749,0x3f575a0,0x3a44052,0x3ced59b, - 0x3b4317f,0x03a8c60,0x13c8874,0x00c4ed4 }, - { 0x0d11549,0x0b8ab02,0x221cb40,0x02ed37b,0x2071ee1,0x1fc8c83, - 0x3987dd4,0x27e049a,0x0f986f1,0x00b4eaf } }, - /* 31 */ - { { 0x15581a2,0x2214060,0x11af4c2,0x1598c88,0x19a0a6d,0x32acba6, - 0x3a7a0f0,0x2337c66,0x210ded9,0x0300dbe }, - { 0x1fbd009,0x3822eb0,0x181629a,0x2401b45,0x30b68b1,0x2e78363, - 0x2b32779,0x006530b,0x2c4b6d4,0x029aca8 } }, - /* 32 */ - { { 0x13549cf,0x0f943db,0x265ed43,0x1bfeb35,0x06f3369,0x3847f2d, - 0x1bfdacc,0x26181a5,0x252af7c,0x02043b8 }, - { 0x159bb2c,0x143f85c,0x357b654,0x2f9d62c,0x2f7dfbe,0x1a7fa9c, - 0x057e74d,0x05d14ac,0x17a9273,0x035215c } }, - /* 33 */ - { { 0x0cb5a98,0x106a2bc,0x10bf117,0x24c7cc4,0x3d3da8f,0x2ce0ab7, - 0x14e2cba,0x1813866,0x1a72f9a,0x01a9811 }, - { 0x2b2411d,0x3034fe8,0x16e0170,0x0f9443a,0x0be0eb8,0x2196cf3, - 0x0c9f738,0x15e40ef,0x0faf9e1,0x034f917 } }, - /* 34 */ - { { 0x03f7669,0x3da6efa,0x3d6bce1,0x209ca1d,0x109f8ae,0x09109e3, - 0x08ae543,0x3067255,0x1dee3c2,0x0081dd5 }, - { 0x3ef1945,0x358765b,0x28c387b,0x3bec4b4,0x218813c,0x0b7d92a, - 0x3cd1d67,0x2c0367e,0x2e57154,0x0123717 } }, - /* 35 */ - { { 0x3e5a199,0x1e42ffd,0x0bb7123,0x33e6273,0x1e0efb8,0x294671e, - 0x3a2bfe0,0x3d11709,0x2eddff6,0x03cbec2 }, - { 0x0b5025f,0x0255d7c,0x1f2241c,0x35d03ea,0x0550543,0x202fef4, - 0x23c8ad3,0x354963e,0x015db28,0x0284fa4 } }, - /* 36 */ - { { 0x2b65cbc,0x1e8d428,0x0226f9f,0x1c8a919,0x10b04b9,0x08fc1e8, - 0x1ce241e,0x149bc99,0x2b01497,0x00afc35 }, - { 0x3216fb7,0x1374fd2,0x226ad3d,0x19fef76,0x0f7d7b8,0x1c21417, - 0x37b83f6,0x3a27eba,0x25a162f,0x010aa52 } }, - /* 37 */ - { { 0x2adf191,0x1ab42fa,0x28d7584,0x2409689,0x20f8a48,0x253707d, - 0x2030504,0x378f7a1,0x169c65e,0x00b0b76 }, - { 0x3849c17,0x085c764,0x10dd6d0,0x2e87689,0x1460488,0x30e9521, - 0x10c7063,0x1b6f120,0x21f42c5,0x03d0dfe } }, - /* 38 */ - { { 0x20f7dab,0x035c512,0x29ac6aa,0x24c5ddb,0x20f0497,0x17ce5e1, - 0x00a050f,0x1eaa14b,0x3335470,0x02abd16 }, - { 0x18d364a,0x0df0cf0,0x316585e,0x018f925,0x0d40b9b,0x17b1511, - 0x1716811,0x1caf3d0,0x10df4f2,0x0337d8c } }, - /* 39 */ - { { 0x2a8b7ef,0x0f188e3,0x2287747,0x06216f0,0x008e935,0x2f6a38d, - 0x1567722,0x0bfc906,0x0bada9e,0x03c3402 }, - { 0x014d3b1,0x099c749,0x2a76291,0x216c067,0x3b37549,0x14ef2f6, - 0x21b96d4,0x1ee2d71,0x2f5ca88,0x016f570 } }, - /* 40 */ - { { 0x09a3154,0x3d1a7bd,0x2e9aef0,0x255b8ac,0x03e85a5,0x2a492a7, - 0x2aec1ea,0x11c6516,0x3c8a09e,0x02a84b7 }, - { 0x1f69f1d,0x09c89d3,0x1e7326f,0x0b28bfd,0x0e0e4c8,0x1ea7751, - 0x18ce73b,0x2a406e7,0x273e48c,0x01b00db } }, - /* 41 */ - { { 0x36e3138,0x2b84a83,0x345a5cf,0x00096b4,0x16966ef,0x159caf1, - 0x13c64b4,0x2f89226,0x25896af,0x00a4bfd }, - { 0x2213402,0x1435117,0x09fed52,0x09d0e4b,0x0f6580e,0x2871cba, - 0x3b397fd,0x1c9d825,0x090311b,0x0191383 } }, - /* 42 */ - { { 0x07153f0,0x1087869,0x18c9e1e,0x1e64810,0x2b86c3b,0x0175d9c, - 0x3dce877,0x269de4e,0x393cab7,0x03c96b9 }, - { 0x1869d0c,0x06528db,0x02641f3,0x209261b,0x29d55c8,0x25ba517, - 0x3b5ea30,0x028f927,0x25313db,0x00e6e39 } }, - /* 43 */ - { { 
0x2fd2e59,0x150802d,0x098f377,0x19a4957,0x135e2c0,0x38a95ce, - 0x1ab21a0,0x36c1b67,0x32f0f19,0x00e448b }, - { 0x3cad53c,0x3387800,0x17e3cfb,0x03f9970,0x3225b2c,0x2a84e1d, - 0x3af1d29,0x3fe35ca,0x2f8ce80,0x0237a02 } }, - /* 44 */ - { { 0x07bbb76,0x3aa3648,0x2758afb,0x1f085e0,0x1921c7e,0x3010dac, - 0x22b74b1,0x230137e,0x1062e36,0x021c652 }, - { 0x3993df5,0x24a2ee8,0x126ab5f,0x2d7cecf,0x0639d75,0x16d5414, - 0x1aa78a8,0x3f78404,0x26a5b74,0x03f0c57 } }, - /* 45 */ - { { 0x0d6ecfa,0x3f506ba,0x3f86561,0x3d86bb1,0x15f8c44,0x2491d07, - 0x052a7b4,0x2422261,0x3adee38,0x039b529 }, - { 0x193c75d,0x14bb451,0x1162605,0x293749c,0x370a70d,0x2e8b1f6, - 0x2ede937,0x2b95f4a,0x39a9be2,0x00d77eb } }, - /* 46 */ - { { 0x2736636,0x15bf36a,0x2b7e6b9,0x25eb8b2,0x209f51d,0x3cd2659, - 0x10bf410,0x034afec,0x3d71c83,0x0076971 }, - { 0x0ce6825,0x07920cf,0x3c3b5c4,0x23fe55c,0x015ad11,0x08c0dae, - 0x0552c7f,0x2e75a8a,0x0fddbf4,0x01c1df0 } }, - /* 47 */ - { { 0x2b9661c,0x0ffe351,0x3d71bf6,0x1ac34b3,0x3a1dfd3,0x211fe3d, - 0x33e140a,0x3f9100d,0x32ee50e,0x014ea18 }, - { 0x16d8051,0x1bfda1a,0x068a097,0x2571d3d,0x1daec0c,0x39389af, - 0x194dc35,0x3f3058a,0x36d34e1,0x000a329 } }, - /* 48 */ - { { 0x09877ee,0x351f73f,0x0002d11,0x0420074,0x2c8b362,0x130982d, - 0x02c1175,0x3c11b40,0x0d86962,0x001305f }, - { 0x0daddf5,0x2f4252c,0x15c06d9,0x1d49339,0x1bea235,0x0b680ed, - 0x3356e67,0x1d1d198,0x1e9fed9,0x03dee93 } }, - /* 49 */ - { { 0x3e1263f,0x2fe8d3a,0x3ce6d0d,0x0d5c6b9,0x3557637,0x0a9bd48, - 0x0405538,0x0710749,0x2005213,0x038c7e5 }, - { 0x26b6ec6,0x2e485ba,0x3c44d1b,0x0b9cf0b,0x037a1d1,0x27428a5, - 0x0e7eac8,0x351ef04,0x259ce34,0x02a8e98 } }, - /* 50 */ - { { 0x2f3dcd3,0x3e77d4d,0x3360fbc,0x1434afd,0x36ceded,0x3d413d6, - 0x1710fad,0x36bb924,0x1627e79,0x008e637 }, - { 0x109569e,0x1c168db,0x3769cf4,0x2ed4527,0x0ea0619,0x17d80d3, - 0x1c03773,0x18843fe,0x1b21c04,0x015c5fd } }, - /* 51 */ - { { 0x1dd895e,0x08a7248,0x04519fe,0x001030a,0x18e5185,0x358dfb3, - 0x13d2391,0x0a37be8,0x0560e3c,0x019828b }, - { 0x27fcbd0,0x2a22bb5,0x30969cc,0x1e03aa7,0x1c84724,0x0ba4ad3, - 0x32f4817,0x0914cca,0x14c4f52,0x01893b9 } }, - /* 52 */ - { { 0x097eccc,0x1273936,0x00aa095,0x364fe62,0x04d49d1,0x10e9f08, - 0x3c24230,0x3ef01c8,0x2fb92bd,0x013ce4a }, - { 0x1e44fd9,0x27e3e9f,0x2156696,0x3915ecc,0x0b66cfb,0x1a3af0f, - 0x2fa8033,0x0e6736c,0x177ccdb,0x0228f9e } }, - /* 53 */ - { { 0x2c4b125,0x06207c1,0x0a8cdde,0x003db8f,0x1ae34e3,0x31e84fa, - 0x2999de5,0x11013bd,0x02370c2,0x00e2234 }, - { 0x0f91081,0x200d591,0x1504762,0x1857c05,0x23d9fcf,0x0cb34db, - 0x27edc86,0x08cd860,0x2471810,0x029798b } }, - /* 54 */ - { { 0x3acd6c8,0x097b8cb,0x3c661a8,0x15152f2,0x1699c63,0x237e64c, - 0x23edf79,0x16b7033,0x0e6466a,0x00b11da }, - { 0x0a64bc9,0x1bfe324,0x1f5cb34,0x08391de,0x0630a60,0x3017a21, - 0x09d064b,0x14a8365,0x041f9e6,0x01ed799 } }, - /* 55 */ - { { 0x128444a,0x2508b07,0x2a39216,0x362f84d,0x2e996c5,0x2c31ff3, - 0x07afe5f,0x1d1288e,0x3cb0c8d,0x02e2bdc }, - { 0x38b86fd,0x3a0ea8c,0x1cff5fd,0x1629629,0x3fee3f1,0x02b250c, - 0x2e8f6f2,0x0225727,0x15f7f3f,0x0280d8e } }, - /* 56 */ - { { 0x10f7770,0x0f1aee8,0x0e248c7,0x20684a8,0x3a6f16d,0x06f0ae7, - 0x0df6825,0x2d4cc40,0x301875f,0x012f8da }, - { 0x3b56dbb,0x1821ba7,0x24f8922,0x22c1f9e,0x0306fef,0x1b54bc8, - 0x2ccc056,0x00303ba,0x2871bdc,0x0232f26 } }, - /* 57 */ - { { 0x0dac4ab,0x0625730,0x3112e13,0x101c4bf,0x3a874a4,0x2873b95, - 0x32ae7c6,0x0d7e18c,0x13e0c08,0x01139d5 }, - { 0x334002d,0x00fffdd,0x025c6d5,0x22c2cd1,0x19d35cb,0x3a1ce2d, - 0x3702760,0x3f06257,0x03a5eb8,0x011c29a } }, - /* 58 */ - { { 
0x0513482,0x1d87724,0x276a81b,0x0a807a4,0x3028720,0x339cc20, - 0x2441ee0,0x31bbf36,0x290c63d,0x0059041 }, - { 0x106a2ed,0x0d2819b,0x100bf50,0x114626c,0x1dd4d77,0x2e08632, - 0x14ae72a,0x2ed3f64,0x1fd7abc,0x035cd1e } }, - /* 59 */ - { { 0x2d4c6e5,0x3bec596,0x104d7ed,0x23d6c1b,0x0262cf0,0x15d72c5, - 0x2d5bb18,0x199ac4b,0x1e30771,0x020591a }, - { 0x21e291e,0x2e75e55,0x1661d7a,0x08b0778,0x3eb9daf,0x0d78144, - 0x1827eb1,0x0fe73d2,0x123f0dd,0x0028db7 } }, - /* 60 */ - { { 0x1d5533c,0x34cb1d0,0x228f098,0x27a1a11,0x17c5f5a,0x0d26f44, - 0x2228ade,0x2c460e6,0x3d6fdba,0x038cc77 }, - { 0x3cc6ed8,0x02ada1a,0x260e510,0x2f7bde8,0x37160c3,0x33a1435, - 0x23d9a7b,0x0ce2641,0x02a492e,0x034ed1e } }, - /* 61 */ - { { 0x3821f90,0x26dba3c,0x3aada14,0x3b59bad,0x292edd9,0x2804c45, - 0x3669531,0x296f42e,0x35a4c86,0x01ca049 }, - { 0x3ff47e5,0x2163df4,0x2441503,0x2f18405,0x15e1616,0x37f66ec, - 0x30f11a7,0x141658a,0x27ece14,0x00b018b } }, - /* 62 */ - { { 0x159ac2e,0x3e65bc0,0x2713a76,0x0db2f6c,0x3281e77,0x2391811, - 0x16d2880,0x1fbc4ab,0x1f92c4e,0x00a0a8d }, - { 0x0ce5cd2,0x152c7b0,0x02299c3,0x3244de7,0x2cf99ef,0x3a0b047, - 0x2caf383,0x0aaf664,0x113554d,0x031c735 } }, - /* 63 */ - { { 0x1b578f4,0x177a702,0x3a7a488,0x1638ebf,0x31884e2,0x2460bc7, - 0x36b1b75,0x3ce8e3d,0x340cf47,0x03143d9 }, - { 0x34b68ea,0x12b7ccd,0x1fe2a9c,0x08da659,0x0a406f3,0x1694c14, - 0x06a2228,0x16370be,0x3a72129,0x02e7b2c } }, - /* 64 */ - { { 0x0f8b16a,0x21043bd,0x266a56f,0x3fb11ec,0x197241a,0x36721f0, - 0x006b8e6,0x2ac6c29,0x202cd42,0x0200fcf }, - { 0x0dbec69,0x0c26a01,0x105f7f0,0x3dceeeb,0x3a83b85,0x363865f, - 0x097273a,0x2b70718,0x00e5067,0x03025d1 } }, - /* 65 */ - { { 0x379ab34,0x295bcb0,0x38d1846,0x22e1077,0x3a8ee06,0x1db1a3b, - 0x3144591,0x07cc080,0x2d5915f,0x03c6bcc }, - { 0x175bd50,0x0dd4c57,0x27bc99c,0x2ebdcbd,0x3837cff,0x235dc8f, - 0x13a4184,0x0722c18,0x130e2d4,0x008f43c } }, - /* 66 */ - { { 0x01500d9,0x2adbb7d,0x2da8857,0x397f2fa,0x10d890a,0x25c9654, - 0x3e86488,0x3eb754b,0x1d6c0a3,0x02c0a23 }, - { 0x10bcb08,0x083cc19,0x2e16853,0x04da575,0x271af63,0x2626a9d, - 0x3520a7b,0x32348c7,0x24ff408,0x03ff4dc } }, - /* 67 */ - { { 0x058e6cb,0x1a3992d,0x1d28539,0x080c5e9,0x2992dad,0x2a9d7d5, - 0x14ae0b7,0x09b7ce0,0x34ad78c,0x03d5643 }, - { 0x30ba55a,0x092f4f3,0x0bae0fc,0x12831de,0x20fc472,0x20ed9d2, - 0x29864f6,0x1288073,0x254f6f7,0x00635b6 } }, - /* 68 */ - { { 0x1be5a2b,0x0f88975,0x33c6ed9,0x20d64d3,0x06fe799,0x0989bff, - 0x1409262,0x085a90c,0x0d97990,0x0142eed }, - { 0x17ec63e,0x06471b9,0x0db2378,0x1006077,0x265422c,0x08db83d, - 0x28099b0,0x1270d06,0x11801fe,0x00ac400 } }, - /* 69 */ - { { 0x3391593,0x22d7166,0x30fcfc6,0x2896609,0x3c385f5,0x066b72e, - 0x04f3aad,0x2b831c5,0x19983fb,0x0375562 }, - { 0x0b82ff4,0x222e39d,0x34c993b,0x101c79c,0x2d2e03c,0x0f00c8a, - 0x3a9eaf4,0x1810669,0x151149d,0x039b931 } }, - /* 70 */ - { { 0x29af288,0x1956ec7,0x293155f,0x193deb6,0x1647e1a,0x2ca0839, - 0x297e4bc,0x15bfd0d,0x1b107ed,0x0147803 }, - { 0x31c327e,0x05a6e1d,0x02ad43d,0x02d2a5b,0x129cdb2,0x37ad1de, - 0x3d51f53,0x245df01,0x2414982,0x0388bd0 } }, - /* 71 */ - { { 0x35f1abb,0x17a3d18,0x0874cd4,0x2d5a14e,0x17edc0c,0x16a00d3, - 0x072c1fb,0x1232725,0x33d52dc,0x03dc24d }, - { 0x0af30d6,0x259aeea,0x369c401,0x12bc4de,0x295bf5f,0x0d8711f, - 0x26162a9,0x16c44e5,0x288e727,0x02f54b4 } }, - /* 72 */ - { { 0x05fa877,0x1571ea7,0x3d48ab1,0x1c9f4e8,0x017dad6,0x0f46276, - 0x343f9e7,0x1de990f,0x0e4c8aa,0x028343e }, - { 0x094f92d,0x3abf633,0x1b3a0bb,0x2f83137,0x0d818c8,0x20bae85, - 0x0c65f8b,0x1a8008b,0x0c7946d,0x0295b1e } }, - /* 73 */ - { { 
0x1d09529,0x08e46c3,0x1fcf296,0x298f6b7,0x1803e0e,0x2d6fd20, - 0x37351f5,0x0d9e8b1,0x1f8731a,0x0362fbf }, - { 0x00157f4,0x06750bf,0x2650ab9,0x35ffb23,0x2f51cae,0x0b522c2, - 0x39cb400,0x191e337,0x0a5ce9f,0x021529a } }, - /* 74 */ - { { 0x3506ea5,0x17d9ed8,0x0d66dc3,0x22693f8,0x19286c4,0x3a57353, - 0x101d3bf,0x1aa54fc,0x20b9884,0x0172b3a }, - { 0x0eac44d,0x37d8327,0x1c3aa90,0x3d0d534,0x23db29a,0x3576eaf, - 0x1d3de8a,0x3bea423,0x11235e4,0x039260b } }, - /* 75 */ - { { 0x34cd55e,0x01288b0,0x1132231,0x2cc9a03,0x358695b,0x3e87650, - 0x345afa1,0x01267ec,0x3f616b2,0x02011ad }, - { 0x0e7d098,0x0d6078e,0x0b70b53,0x237d1bc,0x0d7f61e,0x132de31, - 0x1ea9ea4,0x2bd54c3,0x27b9082,0x03ac5f2 } }, - /* 76 */ - { { 0x2a145b9,0x06d661d,0x31ec175,0x03f06f1,0x3a5cf6b,0x249c56e, - 0x2035653,0x384c74f,0x0bafab5,0x0025ec0 }, - { 0x25f69e1,0x1b23a55,0x1199aa6,0x16ad6f9,0x077e8f7,0x293f661, - 0x33ba11d,0x3327980,0x07bafdb,0x03e571d } }, - /* 77 */ - { { 0x2bae45e,0x3c074ef,0x2955558,0x3c312f1,0x2a8ebe9,0x2f193f1, - 0x3705b1d,0x360deba,0x01e566e,0x00d4498 }, - { 0x21161cd,0x1bc787e,0x2f87933,0x3553197,0x1328ab8,0x093c879, - 0x17eee27,0x2adad1d,0x1236068,0x003be5c } }, - /* 78 */ - { { 0x0ca4226,0x2633dd5,0x2c8e025,0x0e3e190,0x05eede1,0x1a385e4, - 0x163f744,0x2f25522,0x1333b4f,0x03f05b6 }, - { 0x3c800ca,0x1becc79,0x2daabe9,0x0c499e2,0x1138063,0x3fcfa2d, - 0x2244976,0x1e85cf5,0x2f1b95d,0x0053292 } }, - /* 79 */ - { { 0x12f81d5,0x1dc6eaf,0x11967a4,0x1a407df,0x31a5f9d,0x2b67241, - 0x18bef7c,0x08c7762,0x063f59c,0x01015ec }, - { 0x1c05c0a,0x360bfa2,0x1f85bff,0x1bc7703,0x3e4911c,0x0d685b6, - 0x2fccaea,0x02c4cef,0x164f133,0x0070ed7 } }, - /* 80 */ - { { 0x0ec21fe,0x052ffa0,0x3e825fe,0x1ab0956,0x3f6ce11,0x3d29759, - 0x3c5a072,0x18ebe62,0x148db7e,0x03eb49c }, - { 0x1ab05b3,0x02dab0a,0x1ae690c,0x0f13894,0x137a9a8,0x0aab79f, - 0x3dc875c,0x06a1029,0x1e39f0e,0x01dce1f } }, - /* 81 */ - { { 0x16c0dd7,0x3b31269,0x2c741e9,0x3611821,0x2a5cffc,0x1416bb3, - 0x3a1408f,0x311fa3d,0x1c0bef0,0x02cdee1 }, - { 0x00e6a8f,0x1adb933,0x0f23359,0x2fdace2,0x2fd6d4b,0x0e73bd3, - 0x2453fac,0x0a356ae,0x2c8f9f6,0x02704d6 } }, - /* 82 */ - { { 0x0e35743,0x28c80a1,0x0def32a,0x2c6168f,0x1320d6a,0x37c6606, - 0x21b1761,0x2147ee0,0x21fc433,0x015c84d }, - { 0x1fc9168,0x36cda9c,0x003c1f0,0x1cd7971,0x15f98ba,0x1ef363d, - 0x0ca87e3,0x046f7d9,0x3c9e6bb,0x0372eb0 } }, - /* 83 */ - { { 0x118cbe2,0x3665a11,0x304ef01,0x062727a,0x3d242fc,0x11ffbaf, - 0x3663c7e,0x1a189c9,0x09e2d62,0x02e3072 }, - { 0x0e1d569,0x162f772,0x0cd051a,0x322df62,0x3563809,0x047cc7a, - 0x027fd9f,0x08b509b,0x3da2f94,0x01748ee } }, - /* 84 */ - { { 0x1c8f8be,0x31ca525,0x22bf0a1,0x200efcd,0x02961c4,0x3d8f52b, - 0x018403d,0x3a40279,0x1cb91ec,0x030427e }, - { 0x0945705,0x0257416,0x05c0c2d,0x25b77ae,0x3b9083d,0x2901126, - 0x292b8d7,0x07b8611,0x04f2eee,0x026f0cd } }, - /* 85 */ - { { 0x2913074,0x2b8d590,0x02b10d5,0x09d2295,0x255491b,0x0c41cca, - 0x1ca665b,0x133051a,0x1525f1a,0x00a5647 }, - { 0x04f983f,0x3d6daee,0x04e1e76,0x1067d7e,0x1be7eef,0x02ea862, - 0x00d4968,0x0ccb048,0x11f18ef,0x018dd95 } }, - /* 86 */ - { { 0x22976cc,0x17c5395,0x2c38bda,0x3983bc4,0x222bca3,0x332a614, - 0x3a30646,0x261eaef,0x1c808e2,0x02f6de7 }, - { 0x306a772,0x32d7272,0x2dcefd2,0x2abf94d,0x038f475,0x30ad76e, - 0x23e0227,0x3052b0a,0x001add3,0x023ba18 } }, - /* 87 */ - { { 0x0ade873,0x25a6069,0x248ccbe,0x13713ee,0x17ee9aa,0x28152e9, - 0x2e28995,0x2a92cb3,0x17a6f77,0x024b947 }, - { 0x190a34d,0x2ebea1c,0x1ed1948,0x16fdaf4,0x0d698f7,0x32bc451, - 0x0ee6e30,0x2aaab40,0x06f0a56,0x01460be } }, - /* 88 */ - { { 
0x24cc99c,0x1884b1e,0x1ca1fba,0x1a0f9b6,0x2ff609b,0x2b26316, - 0x3b27cb5,0x29bc976,0x35d4073,0x024772a }, - { 0x3575a70,0x1b30f57,0x07fa01b,0x0e5be36,0x20cb361,0x26605cd, - 0x1d4e8c8,0x13cac59,0x2db9797,0x005e833 } }, - /* 89 */ - { { 0x36c8d3a,0x1878a81,0x124b388,0x0e4843e,0x1701aad,0x0ea0d76, - 0x10eae41,0x37d0653,0x36c7f4c,0x00ba338 }, - { 0x37a862b,0x1cf6ac0,0x08fa912,0x2dd8393,0x101ba9b,0x0eebcb7, - 0x2453883,0x1a3cfe5,0x2cb34f6,0x03d3331 } }, - /* 90 */ - { { 0x1f79687,0x3d4973c,0x281544e,0x2564bbe,0x17c5954,0x171e34a, - 0x231741a,0x3cf2784,0x0889a0d,0x02b036d }, - { 0x301747f,0x3f1c477,0x1f1386b,0x163bc5f,0x1592b93,0x332daed, - 0x080e4f5,0x1d28b96,0x26194c9,0x0256992 } }, - /* 91 */ - { { 0x15a4c93,0x07bf6b0,0x114172c,0x1ce0961,0x140269b,0x1b2c2eb, - 0x0dfb1c1,0x019ddaa,0x0ba2921,0x008c795 }, - { 0x2e6d2dc,0x37e45e2,0x2918a70,0x0fce444,0x34d6aa6,0x396dc88, - 0x27726b5,0x0c787d8,0x032d8a7,0x02ac2f8 } }, - /* 92 */ - { { 0x1131f2d,0x2b43a63,0x3101097,0x38cec13,0x0637f09,0x17a69d2, - 0x086196d,0x299e46b,0x0802cf6,0x03c6f32 }, - { 0x0daacb4,0x1a4503a,0x100925c,0x15583d9,0x23c4e40,0x1de4de9, - 0x1cc8fc4,0x2c9c564,0x0695aeb,0x02145a5 } }, - /* 93 */ - { { 0x1dcf593,0x17050fc,0x3e3bde3,0x0a6c062,0x178202b,0x2f7674f, - 0x0dadc29,0x15763a7,0x1d2daad,0x023d9f6 }, - { 0x081ea5f,0x045959d,0x190c841,0x3a78d31,0x0e7d2dd,0x1414fea, - 0x1d43f40,0x22d77ff,0x2b9c072,0x03e115c } }, - /* 94 */ - { { 0x3af71c9,0x29e9c65,0x25655e1,0x111e9cd,0x3a14494,0x3875418, - 0x34ae070,0x0b06686,0x310616b,0x03b7b89 }, - { 0x1734121,0x00d3d44,0x29f0b2f,0x1552897,0x31cac6e,0x1030bb3, - 0x0148f3a,0x35fd237,0x29b44eb,0x027f49f } }, - /* 95 */ - { { 0x2e2cb16,0x1d962bd,0x19b63cc,0x0b3f964,0x3e3eb7d,0x1a35560, - 0x0c58161,0x3ce1d6a,0x3b6958f,0x029030b }, - { 0x2dcc158,0x3b1583f,0x30568c9,0x31957c8,0x27ad804,0x28c1f84, - 0x3967049,0x37b3f64,0x3b87dc6,0x0266f26 } }, - /* 96 */ - { { 0x27dafc6,0x2548764,0x0d1984a,0x1a57027,0x252c1fb,0x24d9b77, - 0x1581a0f,0x1f99276,0x10ba16d,0x026af88 }, - { 0x0915220,0x2be1292,0x16c6480,0x1a93760,0x2fa7317,0x1a07296, - 0x1539871,0x112c31f,0x25787f3,0x01e2070 } }, - /* 97 */ - { { 0x0bcf3ff,0x266d478,0x34f6933,0x31449fd,0x00d02cb,0x340765a, - 0x3465a2d,0x225023e,0x319a30e,0x00579b8 }, - { 0x20e05f4,0x35b834f,0x0404646,0x3710d62,0x3fad7bd,0x13e1434, - 0x21c7d1c,0x1cb3af9,0x2cf1911,0x003957e } }, - /* 98 */ - { { 0x0787564,0x36601be,0x1ce67e9,0x084c7a1,0x21a3317,0x2067a35, - 0x0158cab,0x195ddac,0x1766fe9,0x035cf42 }, - { 0x2b7206e,0x20d0947,0x3b42424,0x03f1862,0x0a51929,0x38c2948, - 0x0bb8595,0x2942d77,0x3748f15,0x0249428 } }, - /* 99 */ - { { 0x2577410,0x3c23e2f,0x28c6caf,0x00d41de,0x0fd408a,0x30298e9, - 0x363289e,0x2302fc7,0x082c1cc,0x01dd050 }, - { 0x30991cd,0x103e9ba,0x029605a,0x19927f7,0x0c1ca08,0x0c93f50, - 0x28a3c7b,0x082e4e9,0x34d12eb,0x0232c13 } }, - /* 100 */ - { { 0x106171c,0x0b4155a,0x0c3fb1c,0x336c090,0x19073e9,0x2241a10, - 0x0e6b4fd,0x0ed476e,0x1ef4712,0x039390a }, - { 0x0ec36f4,0x3754f0e,0x2a270b8,0x007fd2d,0x0f9d2dc,0x1e6a692, - 0x066e078,0x1954974,0x2ff3c6e,0x00def28 } }, - /* 101 */ - { { 0x3562470,0x0b8f1f7,0x0ac94cd,0x28b0259,0x244f272,0x031e4ef, - 0x2d5df98,0x2c8a9f1,0x2dc3002,0x016644f }, - { 0x350592a,0x0e6a0d5,0x1e027a1,0x2039e0f,0x399e01d,0x2817593, - 0x0c0375e,0x3889b3e,0x24ab013,0x010de1b } }, - /* 102 */ - { { 0x256b5a6,0x0ac3b67,0x28f9ff3,0x29b67f1,0x30750d9,0x25e11a9, - 0x15e8455,0x279ebb0,0x298b7e7,0x0218e32 }, - { 0x2fc24b2,0x2b82582,0x28f22f5,0x2bd36b3,0x305398e,0x3b2e9e3, - 0x365dd0a,0x29bc0ed,0x36a7b3a,0x007b374 } }, - /* 103 */ - { { 
0x05ff2f3,0x2b3589b,0x29785d3,0x300a1ce,0x0a2d516,0x0844355, - 0x14c9fad,0x3ccb6b6,0x385d459,0x0361743 }, - { 0x0b11da3,0x002e344,0x18c49f7,0x0c29e0c,0x1d2c22c,0x08237b3, - 0x2988f49,0x0f18955,0x1c3b4ed,0x02813c6 } }, - /* 104 */ - { { 0x17f93bd,0x249323b,0x11f6087,0x174e4bd,0x3cb64ac,0x086dc6b, - 0x2e330a8,0x142c1f2,0x2ea5c09,0x024acbb }, - { 0x1b6e235,0x3132521,0x00f085a,0x2a4a4db,0x1ab2ca4,0x0142224, - 0x3aa6b3e,0x09db203,0x2215834,0x007b9e0 } }, - /* 105 */ - { { 0x23e79f7,0x28b8039,0x1906a60,0x2cbce67,0x1f590e7,0x181f027, - 0x21054a6,0x3854240,0x2d857a6,0x03cfcb3 }, - { 0x10d9b55,0x1443cfc,0x2648200,0x2b36190,0x09d2fcf,0x22f439f, - 0x231aa7e,0x3884395,0x0543da3,0x003d5a9 } }, - /* 106 */ - { { 0x043e0df,0x06ffe84,0x3e6d5b2,0x3327001,0x26c74b6,0x12a145e, - 0x256ec0d,0x3898c69,0x3411969,0x02f63c5 }, - { 0x2b7494a,0x2eee1af,0x38388a9,0x1bd17ce,0x21567d4,0x13969e6, - 0x3a12a7a,0x3e8277d,0x03530cc,0x00b4687 } }, - /* 107 */ - { { 0x06508da,0x38e04d4,0x15a7192,0x312875e,0x3336180,0x2a6512c, - 0x1b59497,0x2e91b37,0x25eb91f,0x02841e9 }, - { 0x394d639,0x0747143,0x37d7e6d,0x1d62962,0x08b4af3,0x34df287, - 0x3c5584b,0x26bc869,0x20af87a,0x0060f5d } }, - /* 108 */ - { { 0x1de59a4,0x1a5c443,0x2f8729d,0x01c3a2f,0x0f1ad8d,0x3cbaf9e, - 0x1b49634,0x35d508a,0x39dc269,0x0075105 }, - { 0x390d30e,0x37033e0,0x110cb32,0x14c37a0,0x20a3b27,0x2f00ce6, - 0x2f1dc52,0x34988c6,0x0c29606,0x01dc7e7 } }, - /* 109 */ - { { 0x1040739,0x24f9de1,0x2939999,0x2e6009a,0x244539d,0x17e3f09, - 0x00f6f2f,0x1c63b3d,0x2310362,0x019109e }, - { 0x1428aa8,0x3cb61e1,0x09a84f4,0x0ffafed,0x07b7adc,0x08f406b, - 0x1b2c6df,0x035b480,0x3496ae9,0x012766d } }, - /* 110 */ - { { 0x35d1099,0x2362f10,0x1a08cc7,0x13a3a34,0x12adbcd,0x32da290, - 0x02e2a02,0x151140b,0x01b3f60,0x0240df6 }, - { 0x34c7b61,0x2eb09c1,0x172e7cd,0x2ad5eff,0x2fe2031,0x25b54d4, - 0x0cec965,0x18e7187,0x26a7cc0,0x00230f7 } }, - /* 111 */ - { { 0x2d552ab,0x374083d,0x01f120f,0x2601736,0x156baff,0x04d44a4, - 0x3b7c3e9,0x1acbc1b,0x0424579,0x031a425 }, - { 0x1231bd1,0x0eba710,0x020517b,0x21d7316,0x21eac6e,0x275a848, - 0x0837abf,0x0eb0082,0x302cafe,0x00fe8f6 } }, - /* 112 */ - { { 0x1058880,0x28f9941,0x03f2d75,0x3bd90e5,0x17da365,0x2ac9249, - 0x07861cf,0x023fd05,0x1b0fdb8,0x031712f }, - { 0x272b56b,0x04f8d2c,0x043a735,0x25446e4,0x1c8327e,0x221125a, - 0x0ce37df,0x2dad7f6,0x39446c2,0x00b55b6 } }, - /* 113 */ - { { 0x346ac6b,0x05e0bff,0x2425246,0x0981e8b,0x1d19f79,0x2692378, - 0x3ea3c40,0x2e90beb,0x19de503,0x003d5af }, - { 0x05cda49,0x353b44d,0x299d137,0x3f205bc,0x2821158,0x3ad0d00, - 0x06a54aa,0x2d7c79f,0x39d1173,0x01000ee } }, - /* 114 */ - { { 0x0803387,0x3a06268,0x14043b8,0x3d4e72f,0x1ece115,0x0a1dfc8, - 0x17208dd,0x0be790a,0x122a07f,0x014dd95 }, - { 0x0a4182d,0x202886a,0x1f79a49,0x1e8c867,0x0a2bbd0,0x28668b5, - 0x0d0a2e1,0x115259d,0x3586c5d,0x01e815b } }, - /* 115 */ - { { 0x18a2a47,0x2c95627,0x2773646,0x1230f7c,0x15b5829,0x2fc354e, - 0x2c000ea,0x099d547,0x2f17a1a,0x01df520 }, - { 0x3853948,0x06f6561,0x3feeb8a,0x2f5b3ef,0x3a6f817,0x01a0791, - 0x2ec0578,0x2c392ad,0x12b2b38,0x0104540 } }, - /* 116 */ - { { 0x1e28ced,0x0fc3d1b,0x2c473c7,0x1826c4f,0x21d5da7,0x39718e4, - 0x38ce9e6,0x0251986,0x172fbea,0x0337c11 }, - { 0x053c3b0,0x0f162db,0x043c1cb,0x04111ee,0x297fe3c,0x32e5e03, - 0x2b8ae12,0x0c427ec,0x1da9738,0x03b9c0f } }, - /* 117 */ - { { 0x357e43a,0x054503f,0x11b8345,0x34ec6e0,0x2d44660,0x3d0ae61, - 0x3b5dff8,0x33884ac,0x09da162,0x00a82b6 }, - { 0x3c277ba,0x129a51a,0x027664e,0x1530507,0x0c788c9,0x2afd89d, - 0x1aa64cc,0x1196450,0x367ac2b,0x0358b42 } }, - /* 118 */ - { { 
0x0054ac4,0x1761ecb,0x378839c,0x167c9f7,0x2570058,0x0604a35, - 0x37cbf3b,0x0909bb7,0x3f2991c,0x02ce688 }, - { 0x0b16ae5,0x212857c,0x351b952,0x2c684db,0x30c6a05,0x09c01e0, - 0x23c137f,0x1331475,0x092c067,0x0013b40 } }, - /* 119 */ - { { 0x2e90393,0x0617466,0x24e61f4,0x0a528f5,0x03047b4,0x2153f05, - 0x0001a69,0x30e1eb8,0x3c10177,0x0282a47 }, - { 0x22c831e,0x28fc06b,0x3e16ff0,0x208adc9,0x0bb76ae,0x28c1d6d, - 0x12c8a15,0x031063c,0x1889ed2,0x002133e } }, - /* 120 */ - { { 0x0a6becf,0x14277bf,0x3328d98,0x201f7fe,0x12fceae,0x1de3a2e, - 0x0a15c44,0x3ddf976,0x1b273ab,0x0355e55 }, - { 0x1b5d4f1,0x369e78c,0x3a1c210,0x12cf3e9,0x3aa52f0,0x309f082, - 0x112089d,0x107c753,0x24202d1,0x023853a } }, - /* 121 */ - { { 0x2897042,0x140d17c,0x2c4aeed,0x07d0d00,0x18d0533,0x22f7ec8, - 0x19c194c,0x3456323,0x2372aa4,0x0165f86 }, - { 0x30bd68c,0x1fb06b3,0x0945032,0x372ac09,0x06d4be0,0x27f8fa1, - 0x1c8d7ac,0x137a96e,0x236199b,0x0328fc0 } }, - /* 122 */ - { { 0x170bd20,0x2842d58,0x1de7592,0x3c5b4fd,0x20ea897,0x12cab78, - 0x363ff14,0x01f928c,0x17e309c,0x02f79ff }, - { 0x0f5432c,0x2edb4ae,0x044b516,0x32f810d,0x2210dc1,0x23e56d6, - 0x301e6ff,0x34660f6,0x10e0a7d,0x02d88eb } }, - /* 123 */ - { { 0x0c7b65b,0x2f59d58,0x2289a75,0x2408e92,0x1ab8c55,0x1ec99e5, - 0x220fd0d,0x04defe0,0x24658ec,0x035aa8b }, - { 0x138bb85,0x2f002d4,0x295c10a,0x08760ce,0x28c31d1,0x1c0a8cb, - 0x0ff00b1,0x144eac9,0x2e02dcc,0x0044598 } }, - /* 124 */ - { { 0x3b42b87,0x050057b,0x0dff781,0x1c06db1,0x1bd9f5d,0x1f5f04a, - 0x2cccd7a,0x143e19b,0x1cb94b7,0x036cfb8 }, - { 0x34837cf,0x3cf6c3c,0x0d4fb26,0x22ee55e,0x1e7eed1,0x315995f, - 0x2cdf937,0x1a96574,0x0425220,0x0221a99 } }, - /* 125 */ - { { 0x1b569ea,0x0d33ed9,0x19c13c2,0x107dc84,0x2200111,0x0569867, - 0x2dc85da,0x05ef22e,0x0eb018a,0x029c33d }, - { 0x04a6a65,0x3e5eba3,0x378f224,0x09c04d0,0x036e5cf,0x3df8258, - 0x3a609e4,0x1eddef8,0x2abd174,0x02a91dc } }, - /* 126 */ - { { 0x2a60cc0,0x1d84c5e,0x115f676,0x1840da0,0x2c79163,0x2f06ed6, - 0x198bb4b,0x3e5d37b,0x1dc30fa,0x018469b }, - { 0x15ee47a,0x1e32f30,0x16a530e,0x2093836,0x02e8962,0x3767b62, - 0x335adf3,0x27220db,0x2f81642,0x0173ffe } }, - /* 127 */ - { { 0x37a99cd,0x1533fe6,0x05a1c0d,0x27610f1,0x17bf3b9,0x0b1ce78, - 0x0a908f6,0x265300e,0x3237dc1,0x01b969a }, - { 0x3a5db77,0x2d15382,0x0d63ef8,0x1feb3d8,0x0b7b880,0x19820de, - 0x11c0c67,0x2af3396,0x38d242d,0x0120688 } }, - /* 128 */ - { { 0x1d0b34a,0x05ef00d,0x00a7e34,0x1ae0c9f,0x1440b38,0x300d8b4, - 0x37262da,0x3e50e3e,0x14ce0cd,0x00b1044 }, - { 0x195a0b1,0x173bc6b,0x03622ba,0x2a19f55,0x1c09b37,0x07921b2, - 0x16cdd20,0x24a5c9b,0x2bf42ff,0x00811de } }, - /* 129 */ - { { 0x0d65dbf,0x145cf06,0x1ad82f7,0x038ce7b,0x077bf94,0x33c4007, - 0x22d26bd,0x25ad9c0,0x09ac773,0x02b1990 }, - { 0x2261cc3,0x2ecdbf1,0x3e908b0,0x3246439,0x0213f7b,0x1179b04, - 0x01cebaa,0x0be1595,0x175cc12,0x033a39a } }, - /* 130 */ - { { 0x00a67d2,0x086d06f,0x248a0f1,0x0291134,0x362d476,0x166d1cd, - 0x044f1d6,0x2d2a038,0x365250b,0x0023f78 }, - { 0x08bf287,0x3b0f6a1,0x1d6eace,0x20b4cda,0x2c2a621,0x0912520, - 0x02dfdc9,0x1b35cd6,0x3d2565d,0x00bdf8b } }, - /* 131 */ - { { 0x3770fa7,0x2e4b6f0,0x03f9ae4,0x170de41,0x1095e8d,0x1dd845c, - 0x334e9d1,0x00ab953,0x12e9077,0x03196fa }, - { 0x2fd0a40,0x228c0fd,0x384b275,0x38ef339,0x3e7d822,0x3e5d9ef, - 0x24f5854,0x0ece9eb,0x247d119,0x012ffe3 } }, - /* 132 */ - { { 0x0ff1480,0x07487c0,0x1b16cd4,0x1f41d53,0x22ab8fb,0x2f83cfa, - 0x01d2efb,0x259f6b2,0x2e65772,0x00f9392 }, - { 0x05303e6,0x23cdb4f,0x23977e1,0x12e4898,0x03bd999,0x0c930f0, - 0x170e261,0x180a27b,0x2fd58ec,0x014e22b } }, - /* 133 */ - { { 
0x25d7713,0x0c5fad7,0x09daad1,0x3b9d779,0x109b985,0x1d3ec98, - 0x35bc4fc,0x2f838cb,0x0d14f75,0x0173e42 }, - { 0x2657b12,0x10d4423,0x19e6760,0x296e5bb,0x2bfd421,0x25c3330, - 0x29f51f8,0x0338838,0x24060f0,0x029a62e } }, - /* 134 */ - { { 0x3748fec,0x2c5a1bb,0x2cf973d,0x289fa74,0x3e6e755,0x38997bf, - 0x0b6544c,0x2b6358c,0x38a7aeb,0x02c50bb }, - { 0x3d5770a,0x06be7c5,0x012fad3,0x19cb2cd,0x266af3b,0x3ccd677, - 0x160d1bd,0x141d5af,0x2965851,0x034625a } }, - /* 135 */ - { { 0x3c41c08,0x255eacc,0x22e1ec5,0x2b151a3,0x087de94,0x311cbdb, - 0x016b73a,0x368e462,0x20b7981,0x0099ec3 }, - { 0x262b988,0x1539763,0x21e76e5,0x15445b4,0x1d8ddc7,0x34a9be6, - 0x10faf03,0x24e4d18,0x07aa111,0x02d538a } }, - /* 136 */ - { { 0x38a876b,0x048ad45,0x04b40a0,0x3fc2144,0x251ff96,0x13ca7dd, - 0x0b31ab1,0x3539814,0x28b5f87,0x0212aec }, - { 0x270790a,0x350e7e0,0x346bd5e,0x276178f,0x22d6cb5,0x3078884, - 0x355c1b6,0x15901d7,0x3671765,0x03950db } }, - /* 137 */ - { { 0x286e8d5,0x2409788,0x13be53f,0x2d21911,0x0353c95,0x10238e8, - 0x32f5bde,0x3a67b60,0x28b5b9c,0x001013d }, - { 0x381e8e5,0x0cef7a9,0x2f5bcad,0x06058f0,0x33cdf50,0x04672a8, - 0x1769600,0x31c055d,0x3df0ac1,0x00e9098 } }, - /* 138 */ - { { 0x2eb596d,0x197b326,0x12b4c29,0x39c08f2,0x101ea03,0x3804e58, - 0x04b4b62,0x28d9d1c,0x13f905e,0x0032a3f }, - { 0x11b2b61,0x08e9095,0x0d06925,0x270e43f,0x21eb7a8,0x0e4a98f, - 0x31d2be0,0x030cf9f,0x2644ddb,0x025b728 } }, - /* 139 */ - { { 0x07510af,0x2ed0e8e,0x2a01203,0x2a2a68d,0x0846fea,0x3e540de, - 0x3a57702,0x1677348,0x2123aad,0x010d8f8 }, - { 0x0246a47,0x0e871d0,0x124dca4,0x34b9577,0x2b362b8,0x363ebe5, - 0x3086045,0x26313e6,0x15cd8bb,0x0210384 } }, - /* 140 */ - { { 0x023e8a7,0x0817884,0x3a0bf12,0x3376371,0x3c808a8,0x18e9777, - 0x12a2721,0x35b538a,0x2bd30de,0x017835a }, - { 0x0fc0f64,0x1c8709f,0x2d8807a,0x0743957,0x242eec0,0x347e76c, - 0x27bef91,0x289689a,0x0f42945,0x01f7a92 } }, - /* 141 */ - { { 0x1060a81,0x3dbc739,0x1615abd,0x1cbe3e5,0x3e79f9c,0x1ab09a2, - 0x136c540,0x05b473f,0x2beebfd,0x02af0a8 }, - { 0x3e2eac7,0x19be474,0x04668ac,0x18f4b74,0x36f10ba,0x0a0b4c6, - 0x10e3770,0x3bf059e,0x3946c7e,0x013a8d4 } }, - /* 142 */ - { { 0x266309d,0x28be354,0x1a3eed8,0x3020651,0x10a51c6,0x1e31770, - 0x0af45a5,0x3ff0f3b,0x2891c94,0x00e9db9 }, - { 0x17b0d0f,0x33a291f,0x0a5f9aa,0x25a3d61,0x2963ace,0x39a5fef, - 0x230c724,0x1919146,0x10a465e,0x02084a8 } }, - /* 143 */ - { { 0x3ab8caa,0x31870f3,0x2390ef7,0x2103850,0x218eb8e,0x3a5ccf2, - 0x1dff677,0x2c59334,0x371599c,0x02a9f2a }, - { 0x0837bd1,0x3249cef,0x35d702f,0x3430dab,0x1c06407,0x108f692, - 0x221292f,0x05f0c5d,0x073fe06,0x01038e0 } }, - /* 144 */ - { { 0x3bf9b7c,0x2020929,0x30d0f4f,0x080fef8,0x3365d23,0x1f3e738, - 0x3e53209,0x1549afe,0x300b305,0x038d811 }, - { 0x0c6c2c7,0x2e6445b,0x3ee64dc,0x022e932,0x0726837,0x0deb67b, - 0x1ed4346,0x3857f73,0x277a3de,0x01950b5 } }, - /* 145 */ - { { 0x36c377a,0x0adb41e,0x08be3f3,0x11e40d1,0x36cb038,0x036a2bd, - 0x3dd3a82,0x1bc875b,0x2ee09bb,0x02994d2 }, - { 0x035facf,0x05e0344,0x07e630a,0x0ce772d,0x335e55a,0x111fce4, - 0x250fe1c,0x3bc89ba,0x32fdc9a,0x03cf2d9 } }, - /* 146 */ - { { 0x355fd83,0x1c67f8e,0x1d10eb3,0x1b21d77,0x0e0d7a4,0x173a9e1, - 0x2c9fa90,0x1c39cce,0x22eaae8,0x01f2bea }, - { 0x153b338,0x0534107,0x26c69b8,0x283be1f,0x3e0acc0,0x059cac3, - 0x13d1081,0x148bbee,0x3c1b9bd,0x002aac4 } }, - /* 147 */ - { { 0x2681297,0x3389e34,0x146addc,0x2c6d425,0x2cb350e,0x1986abc, - 0x0431737,0x04ba4b7,0x2028470,0x012e469 }, - { 0x2f8ddcf,0x3c4255c,0x1af4dcf,0x07a6a44,0x208ebf6,0x0dc90c3, - 0x34360ac,0x072ad23,0x0537232,0x01254d3 } }, - /* 148 */ - { { 
0x07b7e9d,0x3df5c7c,0x116f83d,0x28c4f35,0x3a478ef,0x3011fb8, - 0x2f264b6,0x317b9e3,0x04fd65a,0x032bd1b }, - { 0x2aa8266,0x3431de4,0x04bba04,0x19a44da,0x0edf454,0x392c5ac, - 0x265168a,0x1dc3d5b,0x25704c6,0x00533a7 } }, - /* 149 */ - { { 0x25e8f91,0x1178fa5,0x2492994,0x2eb2c3c,0x0d3aca1,0x0322828, - 0x1cc70f9,0x269c74c,0x0a53e4c,0x006edc2 }, - { 0x18bdd7a,0x2a79a55,0x26b1d5c,0x0200628,0x0734a05,0x3273c7b, - 0x13aa714,0x0040ac2,0x2f2da30,0x03e7449 } }, - /* 150 */ - { { 0x3f9563e,0x2f29eab,0x14a0749,0x3fad264,0x1dd077a,0x3d7c59c, - 0x3a0311b,0x331a789,0x0b9729e,0x0201ebf }, - { 0x1b08b77,0x2a4cdf2,0x3e387f8,0x21510f1,0x286c3a7,0x1dbf62e, - 0x3afa594,0x3363217,0x0d16568,0x01d46b7 } }, - /* 151 */ - { { 0x0715c0d,0x28e2d04,0x17f78ae,0x1c63dda,0x1d113ea,0x0fefc1b, - 0x1eab149,0x1d0fd99,0x0682537,0x00a7b11 }, - { 0x10bebbc,0x11c672d,0x14223d9,0x2ff9141,0x1399ee5,0x34b7b6c, - 0x0d5b3a8,0x01df643,0x0e392a4,0x03fe4dc } }, - /* 152 */ - { { 0x2b75b65,0x0b5a6f1,0x11c559a,0x3549999,0x24188f8,0x37a75f4, - 0x29f33e3,0x34068a2,0x38ba2a9,0x025dd91 }, - { 0x29af2c7,0x0988b64,0x0923885,0x1b539a4,0x1334f5d,0x226947a, - 0x2cc7e5a,0x20beb39,0x13fac2f,0x01d298c } }, - /* 153 */ - { { 0x35f079c,0x137f76d,0x2fbbb2f,0x254638d,0x185b07c,0x1f34db7, - 0x2cfcf0e,0x218f46d,0x2150ff4,0x02add6f }, - { 0x33fc9b7,0x0d9f005,0x0fd081b,0x0834965,0x2b90a74,0x102448d, - 0x3dbf03c,0x167d857,0x02e0b44,0x013afab } }, - /* 154 */ - { { 0x09f2c53,0x317f9d7,0x1411eb6,0x0463aba,0x0d25220,0x256b176, - 0x087633f,0x2bff322,0x07b2c1b,0x037e662 }, - { 0x10aaecb,0x23bb4a1,0x2272bb7,0x06c075a,0x09d4918,0x0736f2b, - 0x0dd511b,0x101625e,0x0a7779f,0x009ec10 } }, - /* 155 */ - { { 0x33b2eb2,0x0176dfd,0x2118904,0x022386c,0x2e0df85,0x2588c9f, - 0x1b71525,0x28fd540,0x137e4cf,0x02ce4f7 }, - { 0x3d75165,0x0c39ecf,0x3554a12,0x30af34c,0x2d66344,0x3ded408, - 0x36f1be0,0x0d065b0,0x012d046,0x0025623 } }, - /* 156 */ - { { 0x2601c3b,0x1824fc0,0x335fe08,0x3e33d70,0x0fb0252,0x252bfca, - 0x1cf2808,0x1922e55,0x1a9db9f,0x020721e }, - { 0x2f56c51,0x39a1f31,0x218c040,0x1a4fc5d,0x3fed471,0x0164d4e, - 0x388a419,0x06f1113,0x0f55fc1,0x03e8352 } }, - /* 157 */ - { { 0x1608e4d,0x3872778,0x022cbc6,0x044d60a,0x3010dda,0x15fb0b5, - 0x37ddc11,0x19f5bda,0x156b6a3,0x023a838 }, - { 0x383b3b4,0x1380bc8,0x353ca35,0x250fc07,0x169966b,0x3780f29, - 0x36632b2,0x2d6b13f,0x124fa00,0x00fd6ae } }, - /* 158 */ - { { 0x1739efb,0x2ec3656,0x2c0d337,0x3d39faf,0x1c751b0,0x04699f4, - 0x252dd64,0x095b8b6,0x0872b74,0x022f1da }, - { 0x2d3d253,0x38edca0,0x379fa5b,0x287d635,0x3a9f679,0x059d9ee, - 0x0ac168e,0x3cd3e87,0x19060fc,0x02ce1bc } }, - /* 159 */ - { { 0x3edcfc2,0x0f04d4b,0x2f0d31f,0x1898be2,0x25396bf,0x15ca230, - 0x02b4eae,0x2713668,0x0f71b06,0x0132d18 }, - { 0x38095ea,0x1ed34d6,0x3603ae6,0x165bf01,0x192bbf8,0x1852859, - 0x075f66b,0x1488f85,0x10895ef,0x014b035 } }, - /* 160 */ - { { 0x1339848,0x3084385,0x0c8d231,0x3a1c1de,0x0e87a28,0x255b85c, - 0x1de6616,0x2702e74,0x1382bb0,0x012b0f2 }, - { 0x198987d,0x381545a,0x34d619b,0x312b827,0x18b2376,0x28fe4cf, - 0x20b7651,0x017d077,0x0c7e397,0x00e0365 } }, - /* 161 */ - { { 0x1542e75,0x0d56aa0,0x39b701a,0x287b806,0x396c724,0x0935c21, - 0x3a29776,0x0debdac,0x171de26,0x00b38f8 }, - { 0x1d5bc1a,0x3fad27d,0x22b5cfe,0x1f89ddf,0x0a65560,0x144dd5b, - 0x2aac2f9,0x139353f,0x0520b62,0x00b9b36 } }, - /* 162 */ - { { 0x031c31d,0x16552e3,0x1a0c368,0x0016fc8,0x168533d,0x171e7b2, - 0x17626e7,0x275502f,0x14742c6,0x03285dd }, - { 0x2d2dbb2,0x3b6bffd,0x1d18cc6,0x2f45d2a,0x0fd0d8c,0x2915e3a, - 0x1e8793a,0x0b39a1d,0x3139cab,0x02a5da9 } }, - /* 163 */ - { { 
0x3fb353d,0x147c6e4,0x3a720a6,0x22d5ff3,0x1d75cab,0x06c54a0, - 0x08cfa73,0x12666aa,0x3170a1f,0x021c829 }, - { 0x13e1b90,0x3a34dda,0x1fc38c3,0x02c5bdb,0x2d345dc,0x14aa1d0, - 0x28d00ab,0x224f23a,0x329c769,0x025c67b } }, - /* 164 */ - { { 0x0e35909,0x3bb6356,0x0116820,0x370cf77,0x29366d8,0x3881409, - 0x3999d06,0x013075f,0x176e157,0x02941ca }, - { 0x0e70b2e,0x28dfab1,0x2a8a002,0x15da242,0x084dcf6,0x116ca97, - 0x31bf186,0x1dc9735,0x09df7b7,0x0264e27 } }, - /* 165 */ - { { 0x2da7a4b,0x3023c9e,0x1366238,0x00ff4e2,0x03abe9d,0x19bd44b, - 0x272e897,0x20b91ad,0x2aa202c,0x02a2201 }, - { 0x380184e,0x08112b4,0x0b85660,0x31049aa,0x3a8cb78,0x36113c5, - 0x1670c0a,0x373f9e7,0x3fb4738,0x00010ef } }, - /* 166 */ - { { 0x2d5192e,0x26d770d,0x32af8d5,0x34d1642,0x1acf885,0x05805e0, - 0x166d0a1,0x1219a0d,0x301ba6c,0x014bcfb }, - { 0x2dcb64d,0x19cca83,0x379f398,0x08e01a0,0x10a482c,0x0103cc2, - 0x0be5fa7,0x1f9d45b,0x1899ef2,0x00ca5af } }, - /* 167 */ - { { 0x14d81d7,0x2aea251,0x1b3c476,0x3bd47ae,0x29eade7,0x0715e61, - 0x1a21cd8,0x1c7a586,0x2bfaee5,0x00ee43f }, - { 0x096f7cb,0x0c08f95,0x1bc4939,0x361fed4,0x255be41,0x26fad73, - 0x31dd489,0x02c600f,0x29d9f81,0x01ba201 } }, - /* 168 */ - { { 0x03ea1db,0x1eac46d,0x1292ce3,0x2a54967,0x20a7ff1,0x3e13c61, - 0x1b02218,0x2b44e14,0x3eadefa,0x029c88a }, - { 0x30a9144,0x31e3b0a,0x19c5a2a,0x147cbe9,0x05a0240,0x051f38e, - 0x11eca56,0x31a4247,0x123bc2a,0x02fa535 } }, - /* 169 */ - { { 0x3226ce7,0x1251782,0x0b7072f,0x11e59fa,0x2b8afd7,0x169b18f, - 0x2a46f18,0x31d9bb7,0x2fe9be8,0x01de0b7 }, - { 0x1b38626,0x34aa90f,0x3ad1760,0x21ddbd9,0x3460ae7,0x1126736, - 0x1b86fc5,0x0b92cd0,0x167a289,0x000e0e1 } }, - /* 170 */ - { { 0x1ec1a0f,0x36bbf5e,0x1c972d8,0x3f73ace,0x13bbcd6,0x23d86a5, - 0x175ffc5,0x2d083d5,0x2c4adf7,0x036f661 }, - { 0x1f39eb7,0x2a20505,0x176c81a,0x3d6e636,0x16ee2fc,0x3cbdc5f, - 0x25475dc,0x2ef4151,0x3c46860,0x0238934 } }, - /* 171 */ - { { 0x2587390,0x3639526,0x0588749,0x13c32fb,0x212bb19,0x09660f1, - 0x207da4b,0x2bf211b,0x1c4407b,0x01506a6 }, - { 0x24c8842,0x105a498,0x05ffdb2,0x0ab61b0,0x26044c1,0x3dff3d8, - 0x1d14b44,0x0d74716,0x049f57d,0x030024b } }, - /* 172 */ - { { 0x32e61ef,0x31d70f7,0x35cad3c,0x320b86c,0x07e8841,0x027ca7d, - 0x2d30d19,0x2513718,0x2347286,0x01d7901 }, - { 0x3c237d0,0x107f16e,0x01c9e7d,0x3c3b13c,0x0c9537b,0x20af54d, - 0x051a162,0x2161a47,0x258c784,0x016df2d } }, - /* 173 */ - { { 0x228ead1,0x29c2122,0x07f6964,0x023f4ed,0x1802dc5,0x19f96ce, - 0x24bfd17,0x25e866b,0x2ba8df0,0x01eb84f }, - { 0x2dd384e,0x05bbe3a,0x3f06fd2,0x366dacb,0x30361a2,0x2f36d7c, - 0x0b98784,0x38ff481,0x074e2a8,0x01e1f60 } }, - /* 174 */ - { { 0x17fbb1c,0x0975add,0x1debc5e,0x2cb2880,0x3e47bdd,0x3488cff, - 0x15e9a36,0x2121129,0x0199ef2,0x017088a }, - { 0x0315250,0x352a162,0x17c1773,0x0ae09c2,0x321b21a,0x3bd74cf, - 0x3c4ea1d,0x3cac2ad,0x3abbaf0,0x039174d } }, - /* 175 */ - { { 0x0511c8a,0x3c78d0a,0x2cd3d2d,0x322f729,0x3ebb229,0x09f0e69, - 0x0a71a76,0x2e74d5e,0x12284df,0x03b5ef0 }, - { 0x3dea561,0x0a9b7e4,0x0ed1cf2,0x237523c,0x05443f1,0x2eb48fa, - 0x3861405,0x1b49f62,0x0c945ca,0x02ab25f } }, - /* 176 */ - { { 0x16bd00a,0x13a9d28,0x3cc1eb5,0x2b7d702,0x2d839e9,0x3e6ff01, - 0x2bb7f11,0x3713824,0x3b31163,0x00c63e5 }, - { 0x30d7138,0x0316fb0,0x0220ecc,0x08eaf0c,0x244e8df,0x0088d81, - 0x37972fb,0x3fd34ae,0x2a19a84,0x03e907e } }, - /* 177 */ - { { 0x2642269,0x0b65d29,0x03bd440,0x33a6ede,0x3c81814,0x2507982, - 0x0d38e47,0x3a788e6,0x32c1d26,0x00e2eda }, - { 0x2577f87,0x392895a,0x3e1cc64,0x14f7047,0x08b52d2,0x08a01ca, - 0x336abf6,0x00697fc,0x105ce76,0x0253742 } }, - /* 178 */ - { { 
0x293f92a,0x33df737,0x3315156,0x32e26d7,0x0a01333,0x26579d4, - 0x004df9c,0x0aba409,0x067d25c,0x02481de }, - { 0x3f39d44,0x1c78042,0x13d7e24,0x0825aed,0x35f2c90,0x3270f63, - 0x04b7b35,0x3ad4531,0x28bd29b,0x0207a10 } }, - /* 179 */ - { { 0x077199f,0x270aeb1,0x0dd96dd,0x3b9ad7b,0x28cb8ee,0x3903f43, - 0x37db3fe,0x292c62b,0x362dbbf,0x006e52a }, - { 0x247f143,0x0362cf3,0x216344f,0x3f18fd1,0x351e623,0x31664e0, - 0x0f270fc,0x243bbc6,0x2280555,0x001a8e3 } }, - /* 180 */ - { { 0x3355b49,0x2c04e6c,0x399b2e5,0x182d3af,0x020e265,0x09a7cf7, - 0x0ffa6bd,0x353e302,0x02083d9,0x029ecdb }, - { 0x33e8830,0x0570e86,0x1c0b64d,0x386a27e,0x0d5fcea,0x0b45a4c, - 0x2ee4a2e,0x0a8833f,0x2b4a282,0x02f9531 } }, - /* 181 */ - { { 0x191167c,0x36cf7e3,0x225ed6c,0x1e79e99,0x0517c3f,0x11ab1fd, - 0x05648f3,0x08aedc4,0x1abeae0,0x02fcc29 }, - { 0x3828a68,0x1e16fa4,0x30368e7,0x0c9fcfb,0x25161c3,0x24851ac, - 0x1b5feb5,0x344eb84,0x0de2732,0x0347208 } }, - /* 182 */ - { { 0x038b363,0x384d1e4,0x2519043,0x151ac17,0x158c11f,0x009b2b4, - 0x257abe6,0x2368d3f,0x3ed68a1,0x02df45e }, - { 0x29c2559,0x2962478,0x3d8444c,0x1d96fff,0x04f7a03,0x1391a52, - 0x0de4af7,0x3319126,0x15e6412,0x00e65ff } }, - /* 183 */ - { { 0x3d61507,0x1d1a0a2,0x0d2af20,0x354d299,0x329e132,0x2a28578, - 0x2ddfb08,0x04fa3ff,0x1293c6c,0x003bae2 }, - { 0x3e259f8,0x1a68fa9,0x3e67e9b,0x39b44f9,0x1ce1db7,0x347e9a1, - 0x3318f6a,0x2dbbc9d,0x2f8c922,0x008a245 } }, - /* 184 */ - { { 0x212ab5b,0x2b896c2,0x0136959,0x07e55ef,0x0cc1117,0x05b8ac3, - 0x18429ed,0x025fa01,0x11d6e93,0x03b016b }, - { 0x03f3708,0x2e96fab,0x1d77157,0x0d4c2d6,0x131baf9,0x0608d39, - 0x3552371,0x06cdd1e,0x1567ff1,0x01f4c50 } }, - /* 185 */ - { { 0x2dfefab,0x270173d,0x37077bd,0x1a372cd,0x1be2f22,0x28e2ee5, - 0x3ead973,0x35e8f94,0x2fc9bc1,0x03a7399 }, - { 0x36a02a1,0x2855d9b,0x00ed75a,0x37d8398,0x138c087,0x233706e, - 0x147f346,0x01947e2,0x3017228,0x0365942 } }, - /* 186 */ - { { 0x2057e60,0x2d31296,0x25e4504,0x2fa37bc,0x1cbccc3,0x1f0732f, - 0x3532081,0x2de8a98,0x19a804e,0x005359a }, - { 0x31f411a,0x2a10576,0x369c2c8,0x02fe035,0x109fbaf,0x30bddeb, - 0x1eef901,0x1662ad3,0x0410d43,0x01bd31a } }, - /* 187 */ - { { 0x2c24a96,0x1b7d3a5,0x19a3872,0x217f2f6,0x2534dbc,0x2cab8c2, - 0x066ef28,0x26aecf1,0x0fd6118,0x01310d4 }, - { 0x055b8da,0x1fdc5be,0x38a1296,0x25118f0,0x341a423,0x2ba4cd0, - 0x3e1413e,0x062d70d,0x2425a31,0x029c9b4 } }, - /* 188 */ - { { 0x08c1086,0x1acfba5,0x22e1dae,0x0f72f4e,0x3f1de50,0x0f408bc, - 0x35ed3f0,0x3ce48fc,0x282cc6c,0x004d8e7 }, - { 0x1afaa86,0x24e3ef3,0x22589ac,0x3ec9952,0x1f45bc5,0x14144ca, - 0x23b26e4,0x0d68c65,0x1e1c1a3,0x032a4d9 } }, - /* 189 */ - { { 0x03b2d20,0x16b1d53,0x241b361,0x05e4138,0x1742a54,0x32741c7, - 0x0521c4c,0x1ca96c2,0x034970b,0x02738a7 }, - { 0x13e0ad6,0x207dcdb,0x034c8cc,0x27bcbe1,0x18060da,0x33a18b6, - 0x2d1d1a6,0x2be60d7,0x3d7ab42,0x012312a } }, - /* 190 */ - { { 0x0c7485a,0x06c3310,0x0dbfd22,0x2ef949d,0x0ead455,0x098f4ba, - 0x3c76989,0x0cf2d24,0x032f67b,0x01e005f }, - { 0x30cb5ee,0x0d5da64,0x0ed2b9d,0x2503102,0x1c0d14e,0x1cbc693, - 0x37bf552,0x07013e2,0x054de5c,0x014f341 } }, - /* 191 */ - { { 0x128ccac,0x1617e97,0x346ebcd,0x158016d,0x25f823e,0x34048ea, - 0x39f0a1c,0x3ea3df1,0x1c1d3d7,0x03ba919 }, - { 0x151803b,0x01967c1,0x2f70781,0x27df39a,0x06c0b59,0x24a239c, - 0x15a7702,0x2464d06,0x2a47ae6,0x006db90 } }, - /* 192 */ - { { 0x27d04c3,0x024df3d,0x38112e8,0x38a27ba,0x01e312b,0x0965358, - 0x35d8879,0x2f4f55a,0x214187f,0x0008936 }, - { 0x05fe36f,0x2ee18c3,0x1f5f87a,0x1813bd4,0x0580f3c,0x0ed0a7b, - 0x0fb1bfb,0x3fcce59,0x2f042bf,0x01820e3 } }, - /* 193 */ - { { 
0x20bbe99,0x32cbc9f,0x39ee432,0x3cc12a8,0x37bda44,0x3ea4e40, - 0x097c7a9,0x0590d7d,0x2022d33,0x018dbac }, - { 0x3ae00aa,0x3439864,0x2d2ffcf,0x3f8c6b9,0x0875a00,0x3e4e407, - 0x3658a29,0x22eb3d0,0x2b63921,0x022113b } }, - /* 194 */ - { { 0x33bae58,0x05c749a,0x1f3e114,0x1c45f8e,0x27db3df,0x06a3ab6, - 0x37bc7f8,0x1e27b34,0x3dc51fb,0x009eea0 }, - { 0x3f54de5,0x3d0e7fe,0x1a71a7d,0x02ed7f8,0x0727703,0x2ca5e92, - 0x2e8e35d,0x292ad0b,0x13487f3,0x02b6d8b } }, - /* 195 */ - { { 0x175df2a,0x05a28a8,0x32e99b1,0x13d8630,0x2082aa0,0x11ac245, - 0x24f2e71,0x322cb27,0x17675e7,0x02e643f }, - { 0x1f37313,0x2765ad3,0x0789082,0x1e742d0,0x11c2055,0x2021dc4, - 0x09ae4a7,0x346359b,0x2f94d10,0x0205c1f } }, - /* 196 */ - { { 0x3d6ff96,0x1f2ac80,0x336097d,0x3f03610,0x35b851b,0x010b6d2, - 0x0823c4d,0x2a9709a,0x2ead5a8,0x00de4b6 }, - { 0x01afa0b,0x0621965,0x3671528,0x1050b60,0x3f3e9e7,0x2f93829, - 0x0825275,0x006e85f,0x35e94b0,0x016af58 } }, - /* 197 */ - { { 0x2c4927c,0x3ea1382,0x0f23727,0x0d69f23,0x3e38860,0x2b72837, - 0x3cd5ea4,0x2d84292,0x321846a,0x016656f }, - { 0x29dfa33,0x3e182e0,0x018be90,0x2ba563f,0x2caafe2,0x218c0d9, - 0x3baf447,0x1047a6c,0x0a2d483,0x01130cb } }, - /* 198 */ - { { 0x00ed80c,0x2a5fc79,0x0a82a74,0x2c4c74b,0x15f938c,0x30b5ab6, - 0x32124b7,0x295314f,0x2fb8082,0x007c858 }, - { 0x20b173e,0x19f315c,0x12f97e4,0x198217c,0x040e8a6,0x3275977, - 0x2bc20e4,0x01f2633,0x02bc3e9,0x023c750 } }, - /* 199 */ - { { 0x3c4058a,0x24be73e,0x16704f5,0x2d8a4bd,0x3b15e14,0x3076315, - 0x1cfe37b,0x36fe715,0x343926e,0x02c6603 }, - { 0x2c76b09,0x0cf824c,0x3f7898c,0x274cec1,0x11df527,0x18eed18, - 0x08ead48,0x23915bc,0x19b3744,0x00a0a2b } }, - /* 200 */ - { { 0x0cf4ac5,0x1c8b131,0x0afb696,0x0ff7799,0x2f5ac1a,0x022420c, - 0x11baa2e,0x2ce4015,0x1275a14,0x0125cfc }, - { 0x22eac5d,0x360cd4c,0x3568e59,0x3d42f66,0x35e07ee,0x09620e4, - 0x36720fa,0x22b1eac,0x2d0db16,0x01b6b23 } }, - /* 201 */ - { { 0x1a835ef,0x1516bbb,0x2d51f7b,0x3487443,0x14aa113,0x0dd06c2, - 0x1a65e01,0x379300d,0x35920b9,0x012c8fb }, - { 0x04c7341,0x2eda00f,0x3c37e82,0x1b4fd62,0x0d45770,0x1478fba, - 0x127863a,0x26939cd,0x134ddf4,0x01375c5 } }, - /* 202 */ - { { 0x1476cd9,0x1119ca5,0x325bbf9,0x0bf8c69,0x0648d07,0x312d9f8, - 0x01c8b8f,0x136ec51,0x0002f4a,0x03f4c5c }, - { 0x195d0e1,0x10ffd22,0x29aa1cb,0x3443bdc,0x276e695,0x05e6260, - 0x15f9764,0x3cd9783,0x18c9569,0x0053eb1 } }, - /* 203 */ - { { 0x312ae18,0x280197c,0x3fc9ad9,0x303f324,0x251958d,0x29f4a11, - 0x2142408,0x3694366,0x25136ab,0x03b5f1d }, - { 0x1d4abbc,0x1c3c689,0x13ea462,0x3cfc684,0x39b5dd8,0x2d4654b, - 0x09b0755,0x27d4f18,0x3f74d2e,0x03fbf2d } }, - /* 204 */ - { { 0x2119185,0x2525eae,0x1ba4bd0,0x0c2ab11,0x1d54e8c,0x294845e, - 0x2479dea,0x3602d24,0x17e87e0,0x0060069 }, - { 0x0afffb0,0x34fe37f,0x1240073,0x02eb895,0x06cf33c,0x2d7f7ef, - 0x1d763b5,0x04191e0,0x11e1ead,0x027e3f0 } }, - /* 205 */ - { { 0x269544c,0x0e85c57,0x3813158,0x19fc12d,0x20eaf85,0x1e2930c, - 0x22a8fd2,0x1a6a478,0x09d3d3a,0x02a74e0 }, - { 0x1a2da3b,0x30b0b16,0x0847936,0x3d86257,0x138ccbc,0x0f5421a, - 0x25244e6,0x23bdd79,0x1aee117,0x00c01ae } }, - /* 206 */ - { { 0x1eead28,0x07cac32,0x1fbc0bb,0x17627d3,0x17eef63,0x0b3a24e, - 0x0757fdb,0x3dd841d,0x3d745f8,0x002ae17 }, - { 0x25b4549,0x29f24cf,0x2f21ecd,0x1725e48,0x04be2bb,0x10ee010, - 0x1a1274b,0x10b0898,0x27511e9,0x02c48b5 } }, - /* 207 */ - { { 0x2a5ae7a,0x181ef99,0x0be33be,0x3e9dab7,0x101e703,0x3adb971, - 0x1043014,0x2ebb2be,0x1c1097d,0x027d667 }, - { 0x3f250ed,0x16dc603,0x20dc6d7,0x1d0d268,0x38eb915,0x02c89e8, - 0x1605a41,0x12de109,0x0e08a29,0x01f554a } }, - /* 208 */ - { { 
0x0c26def,0x163d988,0x2d1ef0f,0x3a960ac,0x1025585,0x0738e20, - 0x27d79b0,0x05cc3ef,0x201303f,0x00a333a }, - { 0x1644ba5,0x2af345e,0x30b8d1d,0x3a01bff,0x31fc643,0x1acf85e, - 0x0a76fc6,0x04efe98,0x348a1d0,0x03062eb } }, - /* 209 */ - { { 0x1c4216d,0x18e3217,0x02ac34e,0x19c8185,0x200c010,0x17d4192, - 0x13a1719,0x165af51,0x09db7a9,0x0277be0 }, - { 0x3ab8d2c,0x2190b99,0x22b641e,0x0cd88de,0x3b42404,0x1310862, - 0x106a6d6,0x23395f5,0x0b06880,0x000d5fe } }, - /* 210 */ - { { 0x0d2cc88,0x36f9913,0x339d8e9,0x237c2e3,0x0cc61c2,0x34c2832, - 0x309874c,0x2621d28,0x2dd1b48,0x0392806 }, - { 0x17cd8f9,0x07bab3d,0x0c482ed,0x0faf565,0x31b767d,0x2f4bde1, - 0x295c717,0x330c29c,0x179ce10,0x0119b5f } }, - /* 211 */ - { { 0x1ada2c7,0x0c624a7,0x227d47d,0x30e3e6a,0x14fa0a6,0x0829678, - 0x24fd288,0x2b46a43,0x122451e,0x0319ca9 }, - { 0x186b655,0x01f3217,0x0af1306,0x0efe6b5,0x2f0235d,0x1c45ca9, - 0x2086805,0x1d44e66,0x0faf2a6,0x0178f59 } }, - /* 212 */ - { { 0x33b4416,0x10431e6,0x2d99aa6,0x217aac9,0x0cd8fcf,0x2d95a9d, - 0x3ff74ad,0x10bf17a,0x295eb8e,0x01b229e }, - { 0x02a63bd,0x182e9ec,0x004710c,0x00e2e3c,0x06b2f23,0x04b642c, - 0x2c37383,0x32a4631,0x022ad82,0x00d22b9 } }, - /* 213 */ - { { 0x0cda2fb,0x1d198d7,0x26d27f4,0x286381c,0x022acca,0x24ac7c8, - 0x2df7824,0x0b4ba16,0x1e0d9ef,0x03041d3 }, - { 0x29a65b3,0x0f3912b,0x151bfcf,0x2b0175c,0x0fd71e4,0x39aa5e2, - 0x311f50c,0x13ff351,0x3dbc9e5,0x03eeb7e } }, - /* 214 */ - { { 0x0a99363,0x0fc7348,0x2775171,0x23db3c8,0x2b91565,0x134d66c, - 0x0175cd2,0x1bf365a,0x2b48371,0x02dfe5d }, - { 0x16dbf74,0x2389357,0x2f36575,0x3f5c70e,0x38d23ba,0x090f7f8, - 0x3477600,0x3201523,0x32ecafc,0x03d3506 } }, - /* 215 */ - { { 0x1abd48d,0x073ca3f,0x38a451f,0x0d8cb01,0x1ce81be,0x05c51ba, - 0x0e29741,0x03c41ab,0x0eae016,0x0060209 }, - { 0x2e58358,0x1da62d9,0x2358038,0x14b39b2,0x1635687,0x39079b1, - 0x380e345,0x1b49608,0x23983cf,0x019f97d } }, - /* 216 */ - { { 0x34899ef,0x332e373,0x04c0f89,0x3c27aed,0x1949015,0x09663b2, - 0x2f9276b,0x07f1951,0x09a04c1,0x027fbde }, - { 0x3d2a071,0x19fb3d4,0x1b096d3,0x1fe9146,0x3b10e1a,0x0478bbb, - 0x2b3fb06,0x1388329,0x181a99c,0x02f2030 } }, - /* 217 */ - { { 0x1eb82e6,0x14dbe39,0x3920972,0x31fd5b2,0x21a484f,0x02d7697, - 0x0e21715,0x37c431e,0x2629f8c,0x01249c3 }, - { 0x26b50ad,0x26deefa,0x0ffc1a3,0x30688e2,0x39a0284,0x041c65e, - 0x03eb178,0x0bdfd50,0x2f96137,0x034bb94 } }, - /* 218 */ - { { 0x0e0362a,0x334a162,0x194dd37,0x29e3e97,0x2442fa8,0x10d2949, - 0x3836e5a,0x2dccebf,0x0bee5ab,0x037ed1e }, - { 0x33eede6,0x3c739d9,0x2f04a91,0x350ad6c,0x3a5390a,0x14c368b, - 0x26f7bf5,0x11ce979,0x0b408df,0x0366850 } }, - /* 219 */ - { { 0x28ea498,0x0886d5b,0x2e090e0,0x0a4d58f,0x2623478,0x0d74ab7, - 0x2b83913,0x12c6b81,0x18d623f,0x01d8301 }, - { 0x198aa79,0x26d6330,0x3a7f0b8,0x34bc1ea,0x2f74890,0x378955a, - 0x204110f,0x0102538,0x02d8f19,0x01c5066 } }, - /* 220 */ - { { 0x14b0f45,0x2838cd3,0x14e16f0,0x0e0e4aa,0x2d9280b,0x0f18757, - 0x3324c6b,0x1391ceb,0x1ce89d5,0x00ebe74 }, - { 0x0930371,0x3de6048,0x3097fd8,0x1308705,0x3eda266,0x3108c26, - 0x1545dcd,0x1f7583a,0x1c37395,0x02c7e05 } }, - /* 221 */ - { { 0x1fec44a,0x2a9e3a2,0x0caf84f,0x11cf2a9,0x0c8c2ae,0x06da989, - 0x1c807dc,0x3c149a4,0x1141543,0x02906bb }, - { 0x15ffe04,0x0d4e65f,0x2e20424,0x37d896d,0x18bacb2,0x1e05ddd, - 0x1660be8,0x183be17,0x1dd86fb,0x035ba70 } }, - /* 222 */ - { { 0x2853264,0x0ba5fb1,0x0a0b3aa,0x2df88c1,0x2771533,0x23aba6f, - 0x112bb7b,0x3e3086e,0x210ae9b,0x027271b }, - { 0x030b74c,0x0269678,0x1e90a23,0x135a98c,0x24ed749,0x126de7c, - 0x344b23a,0x186da27,0x19640fa,0x0159af5 } }, - /* 223 */ - { { 
0x18061f3,0x3004630,0x3c70066,0x34df20f,0x1190b25,0x1c9cc91, - 0x1fc8e02,0x0d17bc1,0x390f525,0x033cb1c }, - { 0x0eb30cf,0x2f3ad04,0x303aa09,0x2e835dd,0x1cfd2eb,0x143fc95, - 0x02c43a1,0x025e7a1,0x3558aa2,0x000bd45 } }, - /* 224 */ - { { 0x1db7d07,0x3bde52b,0x1500396,0x1089115,0x20b4fc7,0x1e2a8f3, - 0x3f8eacc,0x365f7eb,0x1a5e8d4,0x0053a6b }, - { 0x37079e2,0x120284b,0x000edaa,0x33792c2,0x145baa3,0x20e055f, - 0x365e2d7,0x26ba005,0x3ab8e9d,0x0282b53 } }, - /* 225 */ - { { 0x2653618,0x2dd8852,0x2a5f0bf,0x0f0c7aa,0x2187281,0x1252757, - 0x13e7374,0x3b47855,0x0b86e56,0x02f354c }, - { 0x2e9c47b,0x2fa14cc,0x19ab169,0x3fad401,0x0dc2776,0x24afeed, - 0x3a97611,0x0d07736,0x3cf6979,0x02424a0 } }, - /* 226 */ - { { 0x2e81a13,0x000c91d,0x123967b,0x265885c,0x29bee1a,0x0cb8675, - 0x2d361bd,0x1526823,0x3c9ace1,0x00d7bad }, - { 0x24e5bdc,0x02b969f,0x2c6e128,0x34edb3b,0x12dcd2c,0x3899af0, - 0x24224c6,0x3a1914b,0x0f4448a,0x026a2cb } }, - /* 227 */ - { { 0x1d03b59,0x1c6fc82,0x32abf64,0x28ed96b,0x1c90e62,0x2f57bb2, - 0x3ff168e,0x04de7fd,0x0f4d449,0x01af6d8 }, - { 0x255bc30,0x2bfaf22,0x3fe0dad,0x0584025,0x1c79ead,0x3078ef7, - 0x2197414,0x022a50b,0x0fd94ba,0x0007b0f } }, - /* 228 */ - { { 0x09485c2,0x09dfaf7,0x10c7ba6,0x1e48bec,0x248cc9a,0x028a362, - 0x21d60f7,0x193d93d,0x1c04754,0x0346b2c }, - { 0x2f36612,0x240ac49,0x0d8bd26,0x13b8186,0x259c3a4,0x020d5fb, - 0x38a8133,0x09b0937,0x39d4056,0x01f7341 } }, - /* 229 */ - { { 0x05a4b48,0x1f534fc,0x07725ce,0x148dc8c,0x2adcd29,0x04aa456, - 0x0f79718,0x066e346,0x189377d,0x002fd4d }, - { 0x068ea73,0x336569b,0x184d35e,0x32a08e9,0x3c7f3bb,0x11ce9c8, - 0x3674c6f,0x21bf27e,0x0d9e166,0x034a2f9 } }, - /* 230 */ - { { 0x0fa8e4b,0x2e6418e,0x18fc5d2,0x1ba24ff,0x0559f18,0x0dbedbf, - 0x2de2aa4,0x22338e9,0x3aa510f,0x035d801 }, - { 0x23a4988,0x02aad94,0x02732d1,0x111d374,0x0b455cf,0x0d01c9e, - 0x067082a,0x2ec05fd,0x368b303,0x03cad4b } }, - /* 231 */ - { { 0x035b4ca,0x1fabea6,0x1cbc0d5,0x3f2ed9a,0x02d2232,0x1990c66, - 0x2eb680c,0x3b4ea3b,0x18ecc5a,0x03636fa }, - { 0x1a02709,0x26f8ff1,0x1fa8cba,0x397d6e8,0x230be68,0x043aa14, - 0x3d43cdf,0x25c17fa,0x3a3ee55,0x0380564 } }, - /* 232 */ - { { 0x275a0a6,0x16bd43a,0x0033d3e,0x2b15e16,0x2512226,0x005d901, - 0x26d50fd,0x3bc19bf,0x3b1aeb8,0x02bfb01 }, - { 0x0bb0a31,0x26559e0,0x1aae7fb,0x330dcc2,0x16f1af3,0x06afce2, - 0x13a15a0,0x2ff7645,0x3546e2d,0x029c6e4 } }, - /* 233 */ - { { 0x0f593d2,0x384b806,0x122bbf8,0x0a281e0,0x1d1a904,0x2e93cab, - 0x0505db0,0x08f6454,0x05c6285,0x014e880 }, - { 0x3f2b935,0x22d8e79,0x161a07c,0x16b060a,0x02bff97,0x146328b, - 0x3ceea77,0x238f61a,0x19b3d58,0x02fd1f4 } }, - /* 234 */ - { { 0x17665d5,0x259e9f7,0x0de5672,0x15cbcbd,0x34e3030,0x035240f, - 0x0005ae8,0x286d851,0x07f39c9,0x000070b }, - { 0x1efc6d6,0x2a0051a,0x2724143,0x2a9ef1e,0x0c810bd,0x1e05429, - 0x25670ba,0x2e66d7d,0x0e786ff,0x03f6b7e } }, - /* 235 */ - { { 0x3c00785,0x232e23f,0x2b67fd3,0x244ed23,0x077fa75,0x3cda3ef, - 0x14d055b,0x0f25011,0x24d5aa4,0x00ea0e3 }, - { 0x297bb9a,0x198ca4f,0x14d9561,0x18d1076,0x39eb933,0x2b6caa0, - 0x1591a60,0x0768d45,0x257873e,0x00f36e0 } }, - /* 236 */ - { { 0x1e77eab,0x0502a5f,0x0109137,0x0350592,0x3f7e1c5,0x3ac7437, - 0x2dcad2c,0x1fee9d8,0x089f1f5,0x0169833 }, - { 0x0d45673,0x0d8e090,0x065580b,0x065644f,0x11b82be,0x3592dd0, - 0x3284b8d,0x23f0015,0x16fdbfd,0x0248bfd } }, - /* 237 */ - { { 0x1a129a1,0x1977bb2,0x0e041b2,0x15f30a1,0x0a5b1ce,0x3afef8f, - 0x380c46c,0x3358810,0x27df6c5,0x01ca466 }, - { 0x3b90f9a,0x3d14ea3,0x031b298,0x02e2390,0x2d719c0,0x25bc615, - 0x2c0e777,0x0226b8c,0x3803624,0x0179e45 } }, - /* 238 */ - { { 
0x363cdfb,0x1bb155f,0x24fd5c1,0x1c7c72b,0x28e6a35,0x18165f2, - 0x226bea5,0x0beaff3,0x371e24c,0x0138294 }, - { 0x1765357,0x29034e9,0x22b4276,0x11035ce,0x23c89af,0x074468c, - 0x3370ae4,0x013bae3,0x018d566,0x03d7fde } }, - /* 239 */ - { { 0x209df21,0x0f8ff86,0x0e47fbf,0x23b99ba,0x126d5d2,0x2722405, - 0x16bd0a2,0x1799082,0x0e9533f,0x039077c }, - { 0x3ba9e3f,0x3f6902c,0x1895305,0x3ac9813,0x3f2340c,0x3c0d9f1, - 0x26e1927,0x0557c21,0x16eac4f,0x023b75f } }, - /* 240 */ - { { 0x3fc8ff3,0x0770382,0x342fc9a,0x0afa4db,0x314efd8,0x328e07b, - 0x016f7cc,0x3ba599c,0x1caed8a,0x0050cb0 }, - { 0x0b23c26,0x2120a5c,0x3273ec6,0x1cc1cd6,0x2a64fe8,0x2bbc3d6, - 0x09f6e5e,0x34b1b8e,0x00b5ac8,0x032bbd2 } }, - /* 241 */ - { { 0x1315922,0x1725e1d,0x0ca5524,0x1c4c18f,0x3d82951,0x193bcb2, - 0x0e60d0b,0x388dbcf,0x37e8efa,0x0342e85 }, - { 0x1b3af60,0x26ba3ec,0x220e53a,0x394f4b6,0x01a796a,0x3e7bbca, - 0x163605d,0x2b85807,0x17c1c54,0x03cc725 } }, - /* 242 */ - { { 0x1cc4597,0x1635492,0x2028c0f,0x2c2eb82,0x2dc5015,0x0d2a052, - 0x05fc557,0x1f0ebbf,0x0cb96e1,0x0004d01 }, - { 0x1a824bf,0x3896172,0x2ed7b29,0x178007a,0x0d59318,0x07bda2b, - 0x2ee6826,0x0f9b235,0x04b9193,0x01bcddf } }, - /* 243 */ - { { 0x0333fd2,0x0eeb46a,0x15b89f9,0x00968aa,0x2a89302,0x2bdd6b3, - 0x1e5037e,0x2541884,0x24ed2d0,0x01b6e8f }, - { 0x04399cd,0x3be6334,0x3adea48,0x1bb9adc,0x31811c6,0x05fb2bc, - 0x360752c,0x3d29dcb,0x3423bec,0x03c4f3c } }, - /* 244 */ - { { 0x119e2eb,0x2e7b02a,0x0f68cee,0x257d8b0,0x183a9a1,0x2ae88a6, - 0x3a3bb67,0x2eb4f3e,0x1a9274b,0x0320fea }, - { 0x2fa1ce0,0x346c2d8,0x2fbf0d7,0x3d4d063,0x0e58b60,0x09c1bc1, - 0x28ef9e5,0x09a0efe,0x0f45d70,0x02d275c } }, - /* 245 */ - { { 0x2d5513b,0x31d443e,0x1e2d914,0x3b2c5d4,0x105f32e,0x27ee756, - 0x050418d,0x3c73db6,0x1bb0c30,0x01673eb }, - { 0x1cb7fd6,0x1eb08d5,0x26a3e16,0x2e20810,0x0249367,0x029e219, - 0x2ec58c9,0x12d9fab,0x362354a,0x016eafc } }, - /* 246 */ - { { 0x2424865,0x260747b,0x177f37c,0x1e3cb95,0x08b0028,0x2783016, - 0x2970f1b,0x323c1c0,0x2a79026,0x0186231 }, - { 0x0f244da,0x26866f4,0x087306f,0x173ec20,0x31ecced,0x3c84d8d, - 0x070f9b9,0x2e764d5,0x075df50,0x0264ff9 } }, - /* 247 */ - { { 0x32c3609,0x0c737e6,0x14ea68e,0x300b11b,0x184eb19,0x29dd440, - 0x09ec1a9,0x185adeb,0x0664c80,0x0207dd9 }, - { 0x1fbe978,0x30a969d,0x33561d7,0x34fc60e,0x36743fe,0x00774af, - 0x0d1f045,0x018360e,0x12a5fe9,0x01592a0 } }, - /* 248 */ - { { 0x2817d1d,0x2993d3e,0x2e0f7a5,0x112faa0,0x255f968,0x355fe6a, - 0x3f5a0fc,0x075b2d7,0x3cf00e5,0x0089afc }, - { 0x32833cf,0x06a7e4b,0x09a8d6d,0x1693d3e,0x320a0a3,0x3cfdfdd, - 0x136c498,0x1e0d845,0x347ff25,0x01a1de7 } }, - /* 249 */ - { { 0x3043d08,0x030705c,0x20fa79b,0x1d07f00,0x0a54467,0x29b49b4, - 0x367e289,0x0b82f4d,0x0d1eb09,0x025ef2c }, - { 0x32ed3c3,0x1baaa3c,0x3c482ab,0x146ca06,0x3c8a4f1,0x3e85e3c, - 0x1bf4f3b,0x1195534,0x3e80a78,0x02a1cbf } }, - /* 250 */ - { { 0x32b2086,0x2de4d68,0x3486b1a,0x03a0583,0x2e1eb71,0x2dab9af, - 0x10cd913,0x28daa6f,0x3fcb732,0x000a04a }, - { 0x3605318,0x3f5f2b3,0x2d1da63,0x143f7f5,0x1646e5d,0x040b586, - 0x1683982,0x25abe87,0x0c9fe53,0x001ce47 } }, - /* 251 */ - { { 0x380d02b,0x055fc22,0x3f7fc50,0x3458a1d,0x26b8333,0x23550ab, - 0x0a1af87,0x0a821eb,0x2dc7e6d,0x00d574a }, - { 0x07386e1,0x3ccd68a,0x3275b41,0x253e390,0x2fd272a,0x1e6627a, - 0x2ca2cde,0x0e9e4a1,0x1e37c2a,0x00f70ac } }, - /* 252 */ - { { 0x0581352,0x2748701,0x02bed68,0x094dd9e,0x30a00c8,0x3fb5c07, - 0x3bd5909,0x211ac80,0x1103ccd,0x0311e1a }, - { 0x0c768ed,0x29dc209,0x36575db,0x009a107,0x272feea,0x2b33383, - 0x313ed56,0x134c9cc,0x168d5bb,0x033310a } }, - /* 253 */ - { { 
0x17620b9,0x143784f,0x256a94e,0x229664a,0x1d89a5c,0x1d521f2, - 0x0076406,0x1c73f70,0x342aa48,0x03851fa }, - { 0x0f3ae46,0x2ad3bab,0x0fbe274,0x3ed40d4,0x2fd4936,0x232103a, - 0x2afe474,0x25b8f7c,0x047080e,0x008e6b0 } }, - /* 254 */ - { { 0x3fee8d4,0x347cd4a,0x0fec481,0x33fe9ec,0x0ce80b5,0x33a6bcf, - 0x1c4c9e2,0x3967441,0x1a3f5f7,0x03157e8 }, - { 0x257c227,0x1bc53a0,0x200b318,0x0fcd0af,0x2c5b165,0x2a413ec, - 0x2fc998a,0x2da6426,0x19cd4f4,0x0025336 } }, - /* 255 */ - { { 0x303beba,0x2072135,0x32918a9,0x140cb3a,0x08631d1,0x0ef527b, - 0x05f2c9e,0x2b4ce91,0x0b642ab,0x02e428c }, - { 0x0a5abf9,0x15013ed,0x3603b46,0x30dd76d,0x3004750,0x28d7627, - 0x1a42ccc,0x093ddbe,0x39a1b79,0x00067e2 } }, -}; - -/* Multiply the base point of P256 by the scalar and return the result. - * If map is true then convert result to affine coordinates. - * - * Stripe implementation. - * Pre-generated: 2^0, 2^32, ... - * Pre-generated: products of all combinations of above. - * 8 doubles and adds (with qz=1) - * - * r Resulting point. - * k Scalar to multiply by. - * map Indicates whether to convert result to affine. - * ct Constant time required. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. - */ -static int sp_256_ecc_mulmod_base_10(sp_point_256* r, const sp_digit* k, - int map, int ct, void* heap) -{ - return sp_256_ecc_mulmod_stripe_10(r, &p256_base, p256_table, - k, map, ct, heap); -} - -#endif - -/* Multiply the base point of P256 by the scalar and return the result. - * If map is true then convert result to affine coordinates. - * - * km Scalar to multiply by. - * r Resulting point. - * map Indicates whether to convert result to affine. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. - */ -int sp_ecc_mulmod_base_256(const mp_int* km, ecc_point* r, int map, void* heap) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* point = NULL; - sp_digit* k = NULL; -#else - sp_point_256 point[1]; - sp_digit k[10]; -#endif - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, - DYNAMIC_TYPE_ECC); - if (point == NULL) - err = MEMORY_E; - if (err == MP_OKAY) { - k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10, heap, - DYNAMIC_TYPE_ECC); - if (k == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - sp_256_from_mp(k, 10, km); - - err = sp_256_ecc_mulmod_base_10(point, k, map, 1, heap); - } - if (err == MP_OKAY) { - err = sp_256_point_to_ecc_point_10(point, r); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (k != NULL) - XFREE(k, heap, DYNAMIC_TYPE_ECC); - if (point != NULL) - XFREE(point, heap, DYNAMIC_TYPE_ECC); -#endif - - return err; -} - -/* Multiply the base point of P256 by the scalar, add point a and return - * the result. If map is true then convert result to affine coordinates. - * - * km Scalar to multiply by. - * am Point to add to scalar mulitply result. - * inMont Point to add is in montogmery form. - * r Resulting point. - * map Indicates whether to convert result to affine. - * heap Heap to use for allocation. - * returns MEMORY_E when memory allocation fails and MP_OKAY on success. 
- */ -int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, - int inMont, ecc_point* r, int map, void* heap) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* point = NULL; - sp_digit* k = NULL; -#else - sp_point_256 point[2]; - sp_digit k[10 + 10 * 2 * 5]; -#endif - sp_point_256* addP = NULL; - sp_digit* tmp = NULL; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, - DYNAMIC_TYPE_ECC); - if (point == NULL) - err = MEMORY_E; - if (err == MP_OKAY) { - k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (10 + 10 * 2 * 5), - heap, DYNAMIC_TYPE_ECC); - if (k == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - addP = point + 1; - tmp = k + 10; - - sp_256_from_mp(k, 10, km); - sp_256_point_from_ecc_point_10(addP, am); - } - if ((err == MP_OKAY) && (!inMont)) { - err = sp_256_mod_mul_norm_10(addP->x, addP->x, p256_mod); - } - if ((err == MP_OKAY) && (!inMont)) { - err = sp_256_mod_mul_norm_10(addP->y, addP->y, p256_mod); - } - if ((err == MP_OKAY) && (!inMont)) { - err = sp_256_mod_mul_norm_10(addP->z, addP->z, p256_mod); - } - if (err == MP_OKAY) { - err = sp_256_ecc_mulmod_base_10(point, k, 0, 0, heap); - } - if (err == MP_OKAY) { - sp_256_proj_point_add_10(point, point, addP, tmp); - - if (map) { - sp_256_map_10(point, point, tmp); - } - - err = sp_256_point_to_ecc_point_10(point, r); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (k != NULL) - XFREE(k, heap, DYNAMIC_TYPE_ECC); - if (point) - XFREE(point, heap, DYNAMIC_TYPE_ECC); -#endif - - return err; -} - -#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \ - defined(HAVE_ECC_VERIFY) -/* Returns 1 if the number of zero. - * Implementation is constant time. - * - * a Number to check. - * returns 1 if the number is zero and 0 otherwise. - */ -static int sp_256_iszero_10(const sp_digit* a) -{ - return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7] | - a[8] | a[9]) == 0; -} - -#endif /* WOLFSSL_VALIDATE_ECC_KEYGEN | HAVE_ECC_SIGN | HAVE_ECC_VERIFY */ -/* Add 1 to a. (a = a + 1) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_256_add_one_10(sp_digit* a) -{ - a[0]++; - sp_256_norm_10(a); -} - -/* Read big endian unsigned byte array into r. - * - * r A single precision integer. - * size Maximum number of bytes to convert - * a Byte array. - * n Number of bytes in array to read. - */ -static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n) -{ - int i; - int j = 0; - word32 s = 0; - - r[0] = 0; - for (i = n-1; i >= 0; i--) { - r[j] |= (((sp_digit)a[i]) << s); - if (s >= 18U) { - r[j] &= 0x3ffffff; - s = 26U - s; - if (j + 1 >= size) { - break; - } - r[++j] = (sp_digit)a[i] >> s; - s = 8U - s; - } - else { - s += 8U; - } - } - - for (j++; j < size; j++) { - r[j] = 0; - } -} - -/* Generates a scalar that is in the range 1..order-1. - * - * rng Random number generator. - * k Scalar value. - * returns RNG failures, MEMORY_E when memory allocation fails and - * MP_OKAY on success. - */ -static int sp_256_ecc_gen_k_10(WC_RNG* rng, sp_digit* k) -{ - int err; - byte buf[32]; - - do { - err = wc_RNG_GenerateBlock(rng, buf, sizeof(buf)); - if (err == 0) { - sp_256_from_bin(k, 10, buf, (int)sizeof(buf)); - if (sp_256_cmp_10(k, p256_order2) < 0) { - sp_256_add_one_10(k); - break; - } - } - } - while (err == 0); - - return err; -} - -/* Makes a random EC key pair. 
- * - * rng Random number generator. - * priv Generated private value. - * pub Generated public point. - * heap Heap to use for allocation. - * returns ECC_INF_E when the point does not have the correct order, RNG - * failures, MEMORY_E when memory allocation fails and MP_OKAY on success. - */ -int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* point = NULL; - sp_digit* k = NULL; -#else - #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN - sp_point_256 point[2]; - #else - sp_point_256 point[1]; - #endif - sp_digit k[10]; -#endif -#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN - sp_point_256* infinity = NULL; -#endif - int err = MP_OKAY; - - - (void)heap; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); - #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); - #endif - if (point == NULL) - err = MEMORY_E; - if (err == MP_OKAY) { - k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10, heap, - DYNAMIC_TYPE_ECC); - if (k == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN - infinity = point + 1; - #endif - - err = sp_256_ecc_gen_k_10(rng, k); - } - if (err == MP_OKAY) { - err = sp_256_ecc_mulmod_base_10(point, k, 1, 1, NULL); - } - -#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN - if (err == MP_OKAY) { - err = sp_256_ecc_mulmod_10(infinity, point, p256_order, 1, 1, NULL); - } - if (err == MP_OKAY) { - if (sp_256_iszero_10(point->x) || sp_256_iszero_10(point->y)) { - err = ECC_INF_E; - } - } -#endif - - if (err == MP_OKAY) { - err = sp_256_to_mp(k, priv); - } - if (err == MP_OKAY) { - err = sp_256_point_to_ecc_point_10(point, pub); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (k != NULL) - XFREE(k, heap, DYNAMIC_TYPE_ECC); - if (point != NULL) { - /* point is not sensitive, so no need to zeroize */ - XFREE(point, heap, DYNAMIC_TYPE_ECC); - } -#endif - - return err; -} - -#ifdef HAVE_ECC_DHE /* Write r as big endian to byte array. - * Fixed length number of bytes written: 32 + * Fixed length number of bytes written: 512 * * r A single precision integer. * a Byte array. */ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_162(sp_digit* r, byte* a) { int i; int j; int s = 0; int b; - for (i=0; i<9; i++) { + for (i=0; i<161; i++) { r[i+1] += r[i] >> 26; r[i] &= 0x3ffffff; } - j = 256 / 8 - 1; + j = 4096 / 8 - 1; a[j] = 0; - for (i=0; i<10 && j>=0; i++) { + for (i=0; i<162 && j>=0; i++) { b = 0; /* lint allow cast of mismatch sp_digit and int */ a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ @@ -17540,123 +15910,1342 @@ static void sp_256_to_bin(sp_digit* r, byte* a) } } -/* Multiply the point by the scalar and serialize the X ordinate. - * The number is 0 padded to maximum size on output. +#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Normalize the values in each word to 26 bits. * - * priv Scalar to multiply the point by. - * pub Point to multiply. - * out Buffer to hold X ordinate. - * outLen On entry, size of the buffer in bytes. - * On exit, length of data in buffer in bytes. - * heap Heap to use for allocation. - * returns BUFFER_E if the buffer is to small for output size, - * MEMORY_E when memory allocation fails and MP_OKAY on success. + * a Array of sp_digit to normalize. 
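
The sp_4096_to_bin_162() hunk above packs 162 twenty-six-bit limbs into a 512-byte big-endian buffer. A minimal standalone sketch of the same packing idea, with a hypothetical helper name and a generic limb count, and assuming the limbs are already normalized to 26 bits:

#include <stdint.h>

/* Sketch only: serialize n 26-bit limbs (least significant limb first)
 * into a big-endian byte array of len bytes. */
static void limbs_to_bin(const int32_t* a, int n, uint8_t* out, int len)
{
    uint64_t acc = 0;   /* bit accumulator */
    int bits = 0;       /* number of valid bits in acc */
    int i = 0;          /* next limb to consume */
    int j;

    for (j = len - 1; j >= 0; j--) {
        if ((bits < 8) && (i < n)) {
            acc |= (uint64_t)(a[i++] & 0x3ffffff) << bits;
            bits += 26;
        }
        out[j] = (uint8_t)(acc & 0xff);
        acc >>= 8;
        bits -= 8;
    }
}

The unrolled version in the patch fixes the counts at 162 limbs and 512 bytes and performs the limb normalization itself before emitting bytes.
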
*/ -int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, - word32* outLen, void* heap) +static void sp_4096_norm_81(sp_digit* a) { -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_point_256* point = NULL; - sp_digit* k = NULL; -#else - sp_point_256 point[1]; - sp_digit k[10]; -#endif - int err = MP_OKAY; - - if (*outLen < 32U) { - err = BUFFER_E; + int i; + for (i = 0; i < 80; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, - DYNAMIC_TYPE_ECC); - if (point == NULL) - err = MEMORY_E; - } - if (err == MP_OKAY) { - k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10, heap, - DYNAMIC_TYPE_ECC); - if (k == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - sp_256_from_mp(k, 10, priv); - sp_256_point_from_ecc_point_10(point, pub); - err = sp_256_ecc_mulmod_10(point, point, k, 1, 1, heap); - } - if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); - *outLen = 32; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (k != NULL) - XFREE(k, heap, DYNAMIC_TYPE_ECC); - if (point != NULL) - XFREE(point, heap, DYNAMIC_TYPE_ECC); -#endif - - return err; } -#endif /* HAVE_ECC_DHE */ -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -#endif -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_79(sp_digit* a) +{ + int i; + for (i = 0; i < 72; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[73] += a[72] >> 26; a[72] &= 0x3ffffff; + a[74] += a[73] >> 26; a[73] &= 0x3ffffff; + a[75] += a[74] >> 26; a[74] &= 0x3ffffff; + a[76] += a[75] >> 26; a[75] &= 0x3ffffff; + a[77] += a[76] >> 26; a[76] &= 0x3ffffff; + a[78] += a[77] >> 26; a[77] &= 0x3ffffff; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_162(sp_digit* a) +{ + int i; + for (i = 0; i < 160; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[161] += a[160] >> 26; a[160] &= 0x3ffffff; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. 
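
All of the sp_4096_norm_* variants here implement the same carry propagation, just unrolled for different limb counts. A generic sketch of the underlying loop (hypothetical name, not part of the patch; like the generated code it relies on arithmetic right shift for any temporarily negative limbs):

#include <stdint.h>

/* Sketch only: push each limb's overflow above bit 25 into the next
 * limb so every limb ends up within 26 bits; the top limb keeps any
 * remaining carry. */
static void norm_generic(int32_t* a, int n)
{
    int i;
    for (i = 0; i < n - 1; i++) {
        a[i + 1] += a[i] >> 26;   /* carry up */
        a[i]     &= 0x3ffffff;    /* keep the low 26 bits */
    }
}

Unrolling the body eight limbs at a time, as the generated functions do, removes most of the loop overhead at the sizes used here.
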
+ */ +static void sp_4096_norm_158(sp_digit* a) +{ + int i; + for (i = 0; i < 152; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[153] += a[152] >> 26; a[152] &= 0x3ffffff; + a[154] += a[153] >> 26; a[153] &= 0x3ffffff; + a[155] += a[154] >> 26; a[154] &= 0x3ffffff; + a[156] += a[155] >> 26; a[155] &= 0x3ffffff; + a[157] += a[156] >> 26; a[156] &= 0x3ffffff; +} + +#ifndef WOLFSSL_SP_SMALL +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * b[ 0]; + sp_uint64 t1 = ((sp_uint64)a[ 0]) * b[ 1] + + ((sp_uint64)a[ 1]) * b[ 0]; + sp_uint64 t2 = ((sp_uint64)a[ 0]) * b[ 2] + + ((sp_uint64)a[ 1]) * b[ 1] + + ((sp_uint64)a[ 2]) * b[ 0]; + sp_uint64 t3 = ((sp_uint64)a[ 0]) * b[ 3] + + ((sp_uint64)a[ 1]) * b[ 2] + + ((sp_uint64)a[ 2]) * b[ 1] + + ((sp_uint64)a[ 3]) * b[ 0]; + sp_uint64 t4 = ((sp_uint64)a[ 0]) * b[ 4] + + ((sp_uint64)a[ 1]) * b[ 3] + + ((sp_uint64)a[ 2]) * b[ 2] + + ((sp_uint64)a[ 3]) * b[ 1] + + ((sp_uint64)a[ 4]) * b[ 0]; + sp_uint64 t5 = ((sp_uint64)a[ 0]) * b[ 5] + + ((sp_uint64)a[ 1]) * b[ 4] + + ((sp_uint64)a[ 2]) * b[ 3] + + ((sp_uint64)a[ 3]) * b[ 2] + + ((sp_uint64)a[ 4]) * b[ 1] + + ((sp_uint64)a[ 5]) * b[ 0]; + sp_uint64 t6 = ((sp_uint64)a[ 0]) * b[ 6] + + ((sp_uint64)a[ 1]) * b[ 5] + + ((sp_uint64)a[ 2]) * b[ 4] + + ((sp_uint64)a[ 3]) * b[ 3] + + ((sp_uint64)a[ 4]) * b[ 2] + + ((sp_uint64)a[ 5]) * b[ 1] + + ((sp_uint64)a[ 6]) * b[ 0]; + sp_uint64 t7 = ((sp_uint64)a[ 0]) * b[ 7] + + ((sp_uint64)a[ 1]) * b[ 6] + + ((sp_uint64)a[ 2]) * b[ 5] + + ((sp_uint64)a[ 3]) * b[ 4] + + ((sp_uint64)a[ 4]) * b[ 3] + + ((sp_uint64)a[ 5]) * b[ 2] + + ((sp_uint64)a[ 6]) * b[ 1] + + ((sp_uint64)a[ 7]) * b[ 0]; + sp_uint64 t8 = ((sp_uint64)a[ 0]) * b[ 8] + + ((sp_uint64)a[ 1]) * b[ 7] + + ((sp_uint64)a[ 2]) * b[ 6] + + ((sp_uint64)a[ 3]) * b[ 5] + + ((sp_uint64)a[ 4]) * b[ 4] + + ((sp_uint64)a[ 5]) * b[ 3] + + ((sp_uint64)a[ 6]) * b[ 2] + + ((sp_uint64)a[ 7]) * b[ 1] + + ((sp_uint64)a[ 8]) * b[ 0]; + sp_uint64 t9 = ((sp_uint64)a[ 1]) * b[ 8] + + ((sp_uint64)a[ 2]) * b[ 7] + + ((sp_uint64)a[ 3]) * b[ 6] + + ((sp_uint64)a[ 4]) * b[ 5] + + ((sp_uint64)a[ 5]) * b[ 4] + + ((sp_uint64)a[ 6]) * b[ 3] + + ((sp_uint64)a[ 7]) * b[ 2] + + ((sp_uint64)a[ 8]) * b[ 1]; + sp_uint64 t10 = ((sp_uint64)a[ 2]) * b[ 8] + + ((sp_uint64)a[ 3]) * b[ 7] + + ((sp_uint64)a[ 4]) * b[ 6] + + ((sp_uint64)a[ 5]) * b[ 5] + + ((sp_uint64)a[ 6]) * b[ 4] + + ((sp_uint64)a[ 7]) * b[ 3] + + ((sp_uint64)a[ 8]) * b[ 2]; + sp_uint64 t11 = ((sp_uint64)a[ 3]) * b[ 8] + + ((sp_uint64)a[ 4]) * b[ 7] + + ((sp_uint64)a[ 5]) * b[ 6] + + ((sp_uint64)a[ 6]) * b[ 5] + + ((sp_uint64)a[ 7]) * b[ 4] + + ((sp_uint64)a[ 8]) * b[ 3]; + sp_uint64 t12 = ((sp_uint64)a[ 4]) * b[ 8] + + ((sp_uint64)a[ 5]) * b[ 7] + + ((sp_uint64)a[ 6]) * b[ 6] + + ((sp_uint64)a[ 7]) * b[ 5] + + ((sp_uint64)a[ 8]) * b[ 4]; + sp_uint64 t13 = ((sp_uint64)a[ 5]) * b[ 8] + + ((sp_uint64)a[ 6]) * b[ 7] + + ((sp_uint64)a[ 7]) * b[ 6] + + ((sp_uint64)a[ 8]) * b[ 5]; + sp_uint64 t14 = ((sp_uint64)a[ 6]) * b[ 
8] + + ((sp_uint64)a[ 7]) * b[ 7] + + ((sp_uint64)a[ 8]) * b[ 6]; + sp_uint64 t15 = ((sp_uint64)a[ 7]) * b[ 8] + + ((sp_uint64)a[ 8]) * b[ 7]; + sp_uint64 t16 = ((sp_uint64)a[ 8]) * b[ 8]; + + t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; + t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; + t3 += t2 >> 26; r[ 2] = t2 & 0x3ffffff; + t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; + t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; + t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; + t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; + t8 += t7 >> 26; r[ 7] = t7 & 0x3ffffff; + t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; + t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; + t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; + t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; + t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; + t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; + t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; + t16 += t15 >> 26; r[15] = t15 & 0x3ffffff; + r[17] = (sp_digit)(t16 >> 26); + r[16] = t16 & 0x3ffffff; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_9(sp_digit* r, const sp_digit* a) +{ + sp_uint64 t0 = ((sp_uint64)a[ 0]) * a[ 0]; + sp_uint64 t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2; + sp_uint64 t2 = (((sp_uint64)a[ 0]) * a[ 2]) * 2 + + ((sp_uint64)a[ 1]) * a[ 1]; + sp_uint64 t3 = (((sp_uint64)a[ 0]) * a[ 3] + + ((sp_uint64)a[ 1]) * a[ 2]) * 2; + sp_uint64 t4 = (((sp_uint64)a[ 0]) * a[ 4] + + ((sp_uint64)a[ 1]) * a[ 3]) * 2 + + ((sp_uint64)a[ 2]) * a[ 2]; + sp_uint64 t5 = (((sp_uint64)a[ 0]) * a[ 5] + + ((sp_uint64)a[ 1]) * a[ 4] + + ((sp_uint64)a[ 2]) * a[ 3]) * 2; + sp_uint64 t6 = (((sp_uint64)a[ 0]) * a[ 6] + + ((sp_uint64)a[ 1]) * a[ 5] + + ((sp_uint64)a[ 2]) * a[ 4]) * 2 + + ((sp_uint64)a[ 3]) * a[ 3]; + sp_uint64 t7 = (((sp_uint64)a[ 0]) * a[ 7] + + ((sp_uint64)a[ 1]) * a[ 6] + + ((sp_uint64)a[ 2]) * a[ 5] + + ((sp_uint64)a[ 3]) * a[ 4]) * 2; + sp_uint64 t8 = (((sp_uint64)a[ 0]) * a[ 8] + + ((sp_uint64)a[ 1]) * a[ 7] + + ((sp_uint64)a[ 2]) * a[ 6] + + ((sp_uint64)a[ 3]) * a[ 5]) * 2 + + ((sp_uint64)a[ 4]) * a[ 4]; + sp_uint64 t9 = (((sp_uint64)a[ 1]) * a[ 8] + + ((sp_uint64)a[ 2]) * a[ 7] + + ((sp_uint64)a[ 3]) * a[ 6] + + ((sp_uint64)a[ 4]) * a[ 5]) * 2; + sp_uint64 t10 = (((sp_uint64)a[ 2]) * a[ 8] + + ((sp_uint64)a[ 3]) * a[ 7] + + ((sp_uint64)a[ 4]) * a[ 6]) * 2 + + ((sp_uint64)a[ 5]) * a[ 5]; + sp_uint64 t11 = (((sp_uint64)a[ 3]) * a[ 8] + + ((sp_uint64)a[ 4]) * a[ 7] + + ((sp_uint64)a[ 5]) * a[ 6]) * 2; + sp_uint64 t12 = (((sp_uint64)a[ 4]) * a[ 8] + + ((sp_uint64)a[ 5]) * a[ 7]) * 2 + + ((sp_uint64)a[ 6]) * a[ 6]; + sp_uint64 t13 = (((sp_uint64)a[ 5]) * a[ 8] + + ((sp_uint64)a[ 6]) * a[ 7]) * 2; + sp_uint64 t14 = (((sp_uint64)a[ 6]) * a[ 8]) * 2 + + ((sp_uint64)a[ 7]) * a[ 7]; + sp_uint64 t15 = (((sp_uint64)a[ 7]) * a[ 8]) * 2; + sp_uint64 t16 = ((sp_uint64)a[ 8]) * a[ 8]; + + t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; + t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; + t3 += t2 >> 26; r[ 2] = t2 & 0x3ffffff; + t4 += t3 >> 26; r[ 3] = t3 & 0x3ffffff; + t5 += t4 >> 26; r[ 4] = t4 & 0x3ffffff; + t6 += t5 >> 26; r[ 5] = t5 & 0x3ffffff; + t7 += t6 >> 26; r[ 6] = t6 & 0x3ffffff; + t8 += t7 >> 26; r[ 7] = t7 & 0x3ffffff; + t9 += t8 >> 26; r[ 8] = t8 & 0x3ffffff; + t10 += t9 >> 26; r[ 9] = t9 & 0x3ffffff; + t11 += t10 >> 26; r[10] = t10 & 0x3ffffff; + t12 += t11 >> 26; r[11] = t11 & 0x3ffffff; + t13 += t12 >> 26; r[12] = t12 & 0x3ffffff; + t14 += t13 >> 26; r[13] = t13 & 0x3ffffff; + t15 += t14 >> 26; r[14] = t14 & 0x3ffffff; + t16 += t15 >> 26; r[15] = t15 
& 0x3ffffff; + r[17] = (sp_digit)(t16 >> 26); + r[16] = t16 & 0x3ffffff; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + r[ 0] = a[ 0] + b[ 0]; + r[ 1] = a[ 1] + b[ 1]; + r[ 2] = a[ 2] + b[ 2]; + r[ 3] = a[ 3] + b[ 3]; + r[ 4] = a[ 4] + b[ 4]; + r[ 5] = a[ 5] + b[ 5]; + r[ 6] = a[ 6] + b[ 6]; + r[ 7] = a[ 7] + b[ 7]; + r[ 8] = a[ 8] + b[ 8]; + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_18(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 16; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[16] = a[16] - b[16]; + r[17] = a[17] - b[17]; + + return 0; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_18(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 16; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[16] = a[16] + b[16]; + r[17] = a[17] + b[17]; + + return 0; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_9(sp_digit* a) +{ + a[1] += a[0] >> 26; a[0] &= 0x3ffffff; + a[2] += a[1] >> 26; a[1] &= 0x3ffffff; + a[3] += a[2] >> 26; a[2] &= 0x3ffffff; + a[4] += a[3] >> 26; a[3] &= 0x3ffffff; + a[5] += a[4] >> 26; a[4] &= 0x3ffffff; + a[6] += a[5] >> 26; a[5] &= 0x3ffffff; + a[7] += a[6] >> 26; a[6] &= 0x3ffffff; + a[8] += a[7] >> 26; a[7] &= 0x3ffffff; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_18(sp_digit* a) +{ + int i; + for (i = 0; i < 16; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[17] += a[16] >> 26; a[16] &= 0x3ffffff; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. 
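
sp_4096_mul_9() and sp_4096_sqr_9() above accumulate one 64-bit column sum per output limb and then ripple the carries through once. A compact generic form of that product-scanning idea (hypothetical name; small limb counts assumed so a 64-bit column cannot overflow):

#include <stdint.h>

/* Sketch only: schoolbook multiply of two n-limb numbers in 26-bit
 * radix.  Each output column is accumulated in 64 bits, reduced to
 * 26 bits, and the carry passed to the next column.  r needs room for
 * 2*n limbs. */
static void mul_generic(int32_t* r, const int32_t* a, const int32_t* b, int n)
{
    int64_t c = 0;
    int i;
    int k;

    for (k = 0; k < 2 * n - 1; k++) {
        int lo = (k < n) ? 0 : (k - n + 1);
        int hi = (k < n) ? k : (n - 1);
        for (i = lo; i <= hi; i++)
            c += (int64_t)a[i] * b[k - i];
        r[k] = (int32_t)(c & 0x3ffffff);
        c >>= 26;
    }
    r[2 * n - 1] = (int32_t)c;
}

The generated 9-limb versions are this loop fully unrolled, with the squaring case folding each symmetric a[i]*a[j] pair into a single doubled product.
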
+ */ +static void sp_4096_norm_54(sp_digit* a) +{ + int i; + for (i = 0; i < 48; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[49] += a[48] >> 26; a[48] &= 0x3ffffff; + a[50] += a[49] >> 26; a[49] &= 0x3ffffff; + a[51] += a[50] >> 26; a[50] &= 0x3ffffff; + a[52] += a[51] >> 26; a[51] &= 0x3ffffff; + a[53] += a[52] >> 26; a[52] &= 0x3ffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_27(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + sp_digit b0[9]; + sp_digit b1[9]; + sp_digit b2[9]; + (void)sp_4096_add_9(a0, a, &a[9]); + sp_4096_norm_9(a0); + (void)sp_4096_add_9(b0, b, &b[9]); + sp_4096_norm_9(b0); + (void)sp_4096_add_9(a1, &a[9], &a[18]); + sp_4096_norm_9(a1); + (void)sp_4096_add_9(b1, &b[9], &b[18]); + sp_4096_norm_9(b1); + (void)sp_4096_add_9(a2, a0, &a[18]); + sp_4096_norm_9(a1); + (void)sp_4096_add_9(b2, b0, &b[18]); + sp_4096_norm_9(b2); + sp_4096_mul_9(p0, a, b); + sp_4096_mul_9(p2, &a[9], &b[9]); + sp_4096_mul_9(p4, &a[18], &b[18]); + sp_4096_mul_9(p1, a0, b0); + sp_4096_mul_9(p3, a1, b1); + sp_4096_mul_9(p5, a2, b2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_4096_sub_18(t0, p3, p2); + (void)sp_4096_sub_18(t1, p1, p2); + (void)sp_4096_sub_18(t2, p5, t0); + (void)sp_4096_sub_18(t2, t2, t1); + sp_4096_norm_18(t2); + (void)sp_4096_sub_18(t0, t0, p4); + sp_4096_norm_18(t0); + (void)sp_4096_sub_18(t1, t1, p0); + sp_4096_norm_18(t1); + (void)sp_4096_add_18(r, r, p0); + (void)sp_4096_add_18(&r[9], &r[9], t1); + (void)sp_4096_add_18(&r[18], &r[18], t2); + (void)sp_4096_add_18(&r[27], &r[27], t0); + (void)sp_4096_add_18(&r[36], &r[36], p4); + sp_4096_norm_54(r); +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +SP_NOINLINE static void sp_4096_sqr_27(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + (void)sp_4096_add_9(a0, a, &a[9]); + sp_4096_norm_9(a0); + (void)sp_4096_add_9(a1, &a[9], &a[18]); + sp_4096_norm_9(a1); + (void)sp_4096_add_9(a2, a0, &a[18]); + sp_4096_norm_9(a2); + sp_4096_sqr_9(p0, a); + sp_4096_sqr_9(p2, &a[9]); + sp_4096_sqr_9(p4, &a[18]); + sp_4096_sqr_9(p1, a0); + sp_4096_sqr_9(p3, a1); + sp_4096_sqr_9(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_4096_sub_18(t0, p3, p2); + (void)sp_4096_sub_18(t1, p1, p2); + (void)sp_4096_sub_18(t2, p5, t0); + (void)sp_4096_sub_18(t2, t2, t1); + sp_4096_norm_18(t2); + (void)sp_4096_sub_18(t0, t0, p4); + sp_4096_norm_18(t0); + (void)sp_4096_sub_18(t1, t1, p0); + sp_4096_norm_18(t1); + (void)sp_4096_add_18(r, r, p0); + (void)sp_4096_add_18(&r[9], &r[9], t1); + (void)sp_4096_add_18(&r[18], &r[18], t2); + (void)sp_4096_add_18(&r[27], &r[27], t0); + (void)sp_4096_add_18(&r[36], &r[36], p4); + sp_4096_norm_54(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_27(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[24] = a[24] + b[24]; + r[25] = a[25] + b[25]; + r[26] = a[26] + b[26]; + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_54(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 48; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[48] = a[48] - b[48]; + r[49] = a[49] - b[49]; + r[50] = a[50] - b[50]; + r[51] = a[51] - b[51]; + r[52] = a[52] - b[52]; + r[53] = a[53] - b[53]; + + return 0; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_54(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 48; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[48] = a[48] + b[48]; + r[49] = a[49] + b[49]; + r[50] = a[50] + b[50]; + r[51] = a[51] + b[51]; + r[52] = a[52] + b[52]; + r[53] = a[53] + b[53]; + + return 0; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. 
+ */ +static void sp_4096_norm_27(sp_digit* a) +{ + int i; + for (i = 0; i < 24; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[25] += a[24] >> 26; a[24] &= 0x3ffffff; + a[26] += a[25] >> 26; a[25] &= 0x3ffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_81(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit p0[54]; + sp_digit p1[54]; + sp_digit p2[54]; + sp_digit p3[54]; + sp_digit p4[54]; + sp_digit p5[54]; + sp_digit t0[54]; + sp_digit t1[54]; + sp_digit t2[54]; + sp_digit a0[27]; + sp_digit a1[27]; + sp_digit a2[27]; + sp_digit b0[27]; + sp_digit b1[27]; + sp_digit b2[27]; + (void)sp_4096_add_27(a0, a, &a[27]); + sp_4096_norm_27(a0); + (void)sp_4096_add_27(b0, b, &b[27]); + sp_4096_norm_27(b0); + (void)sp_4096_add_27(a1, &a[27], &a[54]); + sp_4096_norm_27(a1); + (void)sp_4096_add_27(b1, &b[27], &b[54]); + sp_4096_norm_27(b1); + (void)sp_4096_add_27(a2, a0, &a[54]); + sp_4096_norm_27(a1); + (void)sp_4096_add_27(b2, b0, &b[54]); + sp_4096_norm_27(b2); + sp_4096_mul_27(p0, a, b); + sp_4096_mul_27(p2, &a[27], &b[27]); + sp_4096_mul_27(p4, &a[54], &b[54]); + sp_4096_mul_27(p1, a0, b0); + sp_4096_mul_27(p3, a1, b1); + sp_4096_mul_27(p5, a2, b2); + XMEMSET(r, 0, sizeof(*r)*2U*81U); + (void)sp_4096_sub_54(t0, p3, p2); + (void)sp_4096_sub_54(t1, p1, p2); + (void)sp_4096_sub_54(t2, p5, t0); + (void)sp_4096_sub_54(t2, t2, t1); + sp_4096_norm_54(t2); + (void)sp_4096_sub_54(t0, t0, p4); + sp_4096_norm_54(t0); + (void)sp_4096_sub_54(t1, t1, p0); + sp_4096_norm_54(t1); + (void)sp_4096_add_54(r, r, p0); + (void)sp_4096_add_54(&r[27], &r[27], t1); + (void)sp_4096_add_54(&r[54], &r[54], t2); + (void)sp_4096_add_54(&r[81], &r[81], t0); + (void)sp_4096_add_54(&r[108], &r[108], p4); + sp_4096_norm_162(r); +} + +/* Square a into r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
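
sp_4096_mul_27() and sp_4096_mul_81() split each operand into three equal thirds and rebuild the product from six half-size multiplications p0..p5 plus additions and subtractions. The identity they rely on is easiest to check with single-word "limbs"; the sketch below is illustrative only, with base 1000 standing in for the real limb-block radix:

#include <assert.h>
#include <stdint.h>

/* Sketch only: the 3-way Karatsuba-style identity behind the
 * mul_27/mul_81 reconstruction.  A = al + am*X + ah*X^2 and likewise
 * for B; the six products mirror the p0..p5 in the patch. */
int main(void)
{
    const int64_t X = 1000;                /* stand-in for the block radix */
    int64_t al = 123, am = 456, ah = 789;  /* thirds of A */
    int64_t bl = 321, bm = 654, bh = 987;  /* thirds of B */

    int64_t p0 = al * bl;
    int64_t p2 = am * bm;
    int64_t p4 = ah * bh;
    int64_t p1 = (al + am) * (bl + bm);
    int64_t p3 = (am + ah) * (bm + bh);
    int64_t p5 = (al + am + ah) * (bl + bm + bh);

    int64_t t0 = p3 - p2;                  /* p4 + coefficient of X^3 */
    int64_t t1 = p1 - p2;                  /* p0 + coefficient of X^1 */
    int64_t t2 = p5 - t0 - t1;             /* coefficient of X^2      */

    int64_t prod = p0
                 + (t1 - p0) * X
                 + t2 * X * X
                 + (t0 - p4) * X * X * X
                 + p4 * X * X * X * X;

    assert(prod == (al + am * X + ah * X * X) * (bl + bm * X + bh * X * X));
    return 0;
}

sp_4096_mul_162() and sp_4096_sqr_162() then stack one more level of the plain two-way Karatsuba split, which needs only three half-size products, on top of the 81-limb routines.
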
+ */ +SP_NOINLINE static void sp_4096_sqr_81(sp_digit* r, const sp_digit* a) +{ + sp_digit p0[54]; + sp_digit p1[54]; + sp_digit p2[54]; + sp_digit p3[54]; + sp_digit p4[54]; + sp_digit p5[54]; + sp_digit t0[54]; + sp_digit t1[54]; + sp_digit t2[54]; + sp_digit a0[27]; + sp_digit a1[27]; + sp_digit a2[27]; + (void)sp_4096_add_27(a0, a, &a[27]); + sp_4096_norm_27(a0); + (void)sp_4096_add_27(a1, &a[27], &a[54]); + sp_4096_norm_27(a1); + (void)sp_4096_add_27(a2, a0, &a[54]); + sp_4096_norm_27(a2); + sp_4096_sqr_27(p0, a); + sp_4096_sqr_27(p2, &a[27]); + sp_4096_sqr_27(p4, &a[54]); + sp_4096_sqr_27(p1, a0); + sp_4096_sqr_27(p3, a1); + sp_4096_sqr_27(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*81U); + (void)sp_4096_sub_54(t0, p3, p2); + (void)sp_4096_sub_54(t1, p1, p2); + (void)sp_4096_sub_54(t2, p5, t0); + (void)sp_4096_sub_54(t2, t2, t1); + sp_4096_norm_54(t2); + (void)sp_4096_sub_54(t0, t0, p4); + sp_4096_norm_54(t0); + (void)sp_4096_sub_54(t1, t1, p0); + sp_4096_norm_54(t1); + (void)sp_4096_add_54(r, r, p0); + (void)sp_4096_add_54(&r[27], &r[27], t1); + (void)sp_4096_add_54(&r[54], &r[54], t2); + (void)sp_4096_add_54(&r[81], &r[81], t0); + (void)sp_4096_add_54(&r[108], &r[108], p4); + sp_4096_norm_162(r); +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_81(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 80; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[80] = a[80] + b[80]; + + return 0; +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_162(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 160; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; + } + r[160] = a[160] + b[160]; + r[161] = a[161] + b[161]; + + return 0; +} + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_162(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 160; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[160] = a[160] - b[160]; + r[161] = a[161] - b[161]; + + return 0; +} + +/* Normalize the values in each word to 26 bits. + * + * a Array of sp_digit to normalize. 
+ */ +static void sp_4096_norm_324(sp_digit* a) +{ + int i; + for (i = 0; i < 320; i += 8) { + a[i+1] += a[i+0] >> 26; a[i+0] &= 0x3ffffff; + a[i+2] += a[i+1] >> 26; a[i+1] &= 0x3ffffff; + a[i+3] += a[i+2] >> 26; a[i+2] &= 0x3ffffff; + a[i+4] += a[i+3] >> 26; a[i+3] &= 0x3ffffff; + a[i+5] += a[i+4] >> 26; a[i+4] &= 0x3ffffff; + a[i+6] += a[i+5] >> 26; a[i+5] &= 0x3ffffff; + a[i+7] += a[i+6] >> 26; a[i+6] &= 0x3ffffff; + a[i+8] += a[i+7] >> 26; a[i+7] &= 0x3ffffff; + } + a[321] += a[320] >> 26; a[320] &= 0x3ffffff; + a[322] += a[321] >> 26; a[321] &= 0x3ffffff; + a[323] += a[322] >> 26; a[322] &= 0x3ffffff; +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_162(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit* z0 = r; + sp_digit z1[162]; + sp_digit* a1 = z1; + sp_digit b1[81]; + sp_digit* z2 = r + 162; + (void)sp_4096_add_81(a1, a, &a[81]); + sp_4096_norm_81(a1); + (void)sp_4096_add_81(b1, b, &b[81]); + sp_4096_norm_81(b1); + sp_4096_mul_81(z2, &a[81], &b[81]); + sp_4096_mul_81(z0, a, b); + sp_4096_mul_81(z1, a1, b1); + (void)sp_4096_sub_162(z1, z1, z2); + (void)sp_4096_sub_162(z1, z1, z0); + (void)sp_4096_add_162(r + 81, r + 81, z1); + sp_4096_norm_324(r); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_162(sp_digit* r, const sp_digit* a) +{ + sp_digit* z0 = r; + sp_digit z1[162]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 162; + (void)sp_4096_add_81(a1, a, &a[81]); + sp_4096_norm_81(a1); + sp_4096_sqr_81(z2, &a[81]); + sp_4096_sqr_81(z0, a); + sp_4096_sqr_81(z1, a1); + (void)sp_4096_sub_162(z1, z1, z2); + (void)sp_4096_sub_162(z1, z1, z0); + (void)sp_4096_add_162(r + 81, r + 81, z1); + sp_4096_norm_324(r); +} + +#endif /* !WOLFSSL_SP_SMALL */ +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x &= 0x3ffffff; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 26) - x; +} + /* Multiply a by scalar b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A scalar. 
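
sp_4096_mont_setup() above derives rho, the negated inverse of the bottom modulus limb modulo 2^26, from nothing but that limb: it seeds an inverse mod 2^4 and doubles its precision three times, exactly as the inline comments describe. A standalone sketch of the same trick with an assert on the defining property:

#include <assert.h>
#include <stdint.h>

/* Sketch only: for an odd low limb m0, build the inverse of m0 mod 2^32
 * by Hensel lifting, then fold it to rho = -1/m0 mod 2^26. */
int main(void)
{
    uint32_t m0 = 0x3bfe2e5UL;                  /* any odd value works  */
    uint32_t x  = (((m0 + 2) & 4) << 1) + m0;   /* inverse of m0 mod 2^4 */
    uint32_t rho;

    x *= 2 - m0 * x;                            /* lift to mod 2^8  */
    x *= 2 - m0 * x;                            /* lift to mod 2^16 */
    x *= 2 - m0 * x;                            /* lift to mod 2^32 */
    assert((uint32_t)(m0 * x) == 1U);

    rho = ((uint32_t)1 << 26) - (x & 0x3ffffff);
    /* Montgomery reduction needs m0 * rho == -1 mod 2^26 */
    assert(((m0 * rho) & 0x3ffffff) == 0x3ffffff);
    return 0;
}

Each lifting step works because if m0*x = 1 + k*2^i then m0*x*(2 - m0*x) = 1 - k*k*2^(2i), so the number of correct low bits doubles per step.
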
*/ -SP_NOINLINE static void sp_256_mul_d_10(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_4096_mul_d_162(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; int i; - for (i = 0; i < 10; i++) { + for (i = 0; i < 160; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 3] = (sp_digit)t2; + } + t += tb * a[160]; + r[160] = (sp_digit)(t & 0x3ffffff); + t >>= 26; + t += tb * a[161]; + r[161] = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[162] = (sp_digit)(t & 0x3ffffff); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) +#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_81(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 80; i += 8) { + r[i + 0] = a[i + 0] - b[i + 0]; + r[i + 1] = a[i + 1] - b[i + 1]; + r[i + 2] = a[i + 2] - b[i + 2]; + r[i + 3] = a[i + 3] - b[i + 3]; + r[i + 4] = a[i + 4] - b[i + 4]; + r[i + 5] = a[i + 5] - b[i + 5]; + r[i + 6] = a[i + 6] - b[i + 6]; + r[i + 7] = a[i + 7] - b[i + 7]; + } + r[80] = a[80] - b[80]; + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 4096 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_4096_mont_norm_81(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i = 0; i < 72; i += 8) { + r[i + 0] = 0x3ffffff; + r[i + 1] = 0x3ffffff; + r[i + 2] = 0x3ffffff; + r[i + 3] = 0x3ffffff; + r[i + 4] = 0x3ffffff; + r[i + 5] = 0x3ffffff; + r[i + 6] = 0x3ffffff; + r[i + 7] = 0x3ffffff; + } + r[72] = 0x3ffffff; + r[73] = 0x3ffffff; + r[74] = 0x3ffffff; + r[75] = 0x3ffffff; + r[76] = 0x3ffffff; + r[77] = 0x3ffffff; + r[78] = 0xfffffL; + r[79] = 0; + r[80] = 0; + + /* r = (2^n - 1) mod n */ + (void)sp_4096_sub_81(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_4096_cmp_81(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + r |= (a[80] - b[80]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + for (i = 72; i >= 0; i -= 8) { + r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 
1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_4096_cond_sub_81(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 80; i += 8) { + r[i + 0] = a[i + 0] - (b[i + 0] & m); + r[i + 1] = a[i + 1] - (b[i + 1] & m); + r[i + 2] = a[i + 2] - (b[i + 2] & m); + r[i + 3] = a[i + 3] - (b[i + 3] & m); + r[i + 4] = a[i + 4] - (b[i + 4] & m); + r[i + 5] = a[i + 5] - (b[i + 5] & m); + r[i + 6] = a[i + 6] - (b[i + 6] & m); + r[i + 7] = a[i + 7] - (b[i + 7] & m); + } + r[80] = a[80] - (b[80] & m); +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_add_81(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 81; i++) { + t += r[i]; t += tb * a[i]; - r[i] = (sp_digit)(t & 0x3ffffff); + r[i] = ((sp_digit)t) & 0x3ffffff; t >>= 26; } - r[10] = (sp_digit)t; + r[81] += (sp_digit)t; #else - int64_t tb = b; - int64_t t[10]; + sp_int64 tb = b; + sp_int64 t[8]; + int i; - t[ 0] = tb * a[ 0]; - t[ 1] = tb * a[ 1]; - t[ 2] = tb * a[ 2]; - t[ 3] = tb * a[ 3]; - t[ 4] = tb * a[ 4]; - t[ 5] = tb * a[ 5]; - t[ 6] = tb * a[ 6]; - t[ 7] = tb * a[ 7]; - t[ 8] = tb * a[ 8]; - t[ 9] = tb * a[ 9]; - r[ 0] = (sp_digit) (t[ 0] & 0x3ffffff); - r[ 1] = (sp_digit)((t[ 0] >> 26) + (t[ 1] & 0x3ffffff)); - r[ 2] = (sp_digit)((t[ 1] >> 26) + (t[ 2] & 0x3ffffff)); - r[ 3] = (sp_digit)((t[ 2] >> 26) + (t[ 3] & 0x3ffffff)); - r[ 4] = (sp_digit)((t[ 3] >> 26) + (t[ 4] & 0x3ffffff)); - r[ 5] = (sp_digit)((t[ 4] >> 26) + (t[ 5] & 0x3ffffff)); - r[ 6] = (sp_digit)((t[ 5] >> 26) + (t[ 6] & 0x3ffffff)); - r[ 7] = (sp_digit)((t[ 6] >> 26) + (t[ 7] & 0x3ffffff)); - r[ 8] = (sp_digit)((t[ 7] >> 26) + (t[ 8] & 0x3ffffff)); - r[ 9] = (sp_digit)((t[ 8] >> 26) + (t[ 9] & 0x3ffffff)); - r[10] = (sp_digit) (t[ 9] >> 26); -#endif /* WOLFSSL_SP_SMALL */ + t[0] = 0; + for (i = 0; i < 80; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0x3ffffff; + t[1] += t[0] >> 26; + r[i+1] = t[1] & 0x3ffffff; + t[2] += t[1] >> 26; + r[i+2] = t[2] & 0x3ffffff; + t[3] += t[2] >> 26; + r[i+3] = t[3] & 0x3ffffff; + t[4] += t[3] >> 26; + r[i+4] = t[4] & 0x3ffffff; + t[5] += t[4] >> 26; + r[i+5] = t[5] & 0x3ffffff; + t[6] += t[5] >> 26; + r[i+6] = t[6] & 0x3ffffff; + t[7] += t[6] >> 26; + r[i+7] = t[7] & 0x3ffffff; + t[0] = t[7] >> 26; + } + t[0] += (tb * a[80]) + r[80]; + r[80] = t[0] & 0x3ffffff; + r[81] += (sp_digit)(t[0] >> 26); +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Shift the result in the high 2048 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
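
sp_4096_cmp_81() and sp_4096_cond_sub_81() above avoid branching on secret data: the compare folds limb differences in under a mask that is all-ones only while the running result is still zero, and the conditional subtract ANDs every limb of b with a 0 or all-ones mask instead of testing a flag. A small self-contained sketch of the pattern (hypothetical names, four limbs):

#include <assert.h>
#include <stdint.h>

/* Sketch only: constant-time compare, most significant limb first.
 * Returns negative, zero or positive, like the generated cmp code. */
static int32_t ct_cmp(const int32_t* a, const int32_t* b, int n)
{
    int32_t r = 0;
    int i;
    for (i = n - 1; i >= 0; i--)
        r |= (a[i] - b[i]) & (0 - (int32_t)((r == 0) ? 1 : 0));
    return r;
}

/* Sketch only: subtract b from a when mask m is all-ones, copy a when
 * m is zero, touching the same memory either way. */
static void ct_cond_sub(int32_t* r, const int32_t* a, const int32_t* b,
                        int32_t m, int n)
{
    int i;
    for (i = 0; i < n; i++)
        r[i] = a[i] - (b[i] & m);
}

int main(void)
{
    int32_t a[4] = { 2, 0, 0, 1 };   /* least significant limb first */
    int32_t b[4] = { 1, 0, 0, 1 };
    int32_t r[4];

    assert(ct_cmp(a, b, 4) > 0);     /* a > b, decided by limb 0 */

    ct_cond_sub(r, a, b, 0, 4);      /* mask clear: r = a */
    assert(r[0] == 2);
    ct_cond_sub(r, a, b, -1, 4);     /* mask set: r = a - b */
    assert(r[0] == 1 && r[3] == 0);
    return 0;
}

The reduction code builds such masks from comparison results, for example 0 - ((a[78] - m[78]) > 0) in sp_4096_mont_reduce_81() below, so the final subtraction of the modulus is unconditional in time but conditional in effect.
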
+ */ +static void sp_4096_mont_shift_81(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int64 n = a[78] >> 20; + n += ((sp_int64)a[79]) << 6; + for (i = 0; i < 72; i += 8) { + r[i + 0] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 80]) << 6; + r[i + 1] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 81]) << 6; + r[i + 2] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 82]) << 6; + r[i + 3] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 83]) << 6; + r[i + 4] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 84]) << 6; + r[i + 5] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 85]) << 6; + r[i + 6] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 86]) << 6; + r[i + 7] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 87]) << 6; + } + r[72] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[152]) << 6; + r[73] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[153]) << 6; + r[74] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[154]) << 6; + r[75] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[155]) << 6; + r[76] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[156]) << 6; + r[77] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[157]) << 6; + r[78] = (sp_digit)n; + XMEMSET(&r[79], 0, sizeof(*r) * 79U); +} + +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_4096_mont_reduce_81(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_4096_norm_81(a + 79); + + for (i=0; i<78; i++) { + mu = (a[i] * mp) & 0x3ffffff; + sp_4096_mul_add_81(a+i, m, mu); + a[i+1] += a[i] >> 26; + } + mu = (a[i] * mp) & 0xfffffL; + sp_4096_mul_add_81(a+i, m, mu); + a[i+1] += a[i] >> 26; + a[i] &= 0x3ffffff; + sp_4096_mont_shift_81(a, a); + sp_4096_cond_sub_81(a, a, m, 0 - (((a[78] - m[78]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_4096_norm_81(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_mul_81(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_4096_mul_81(r, a, b); + sp_4096_mont_reduce_81(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_sqr_81(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_4096_sqr_81(r, a); + sp_4096_mont_reduce_81(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. 
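Each pass of sp_4096_mont_reduce_81 picks mu = (a[i] * mp) & 0x3ffffff so that adding mu * m makes the current low digit divisible by 2^26 and therefore safe to shift out; mp is the negative inverse of m[0] modulo 2^26 that sp_4096_mont_setup produces (its definition is outside these hunks). A small self-check of that invariant, using the standard Newton-iteration inverse and hypothetical demo_* names; the odd low digit is arbitrary:

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_MASK 0x3ffffff

    /* mp = -1/m0 mod 2^26 for odd m0, via Newton iteration: each step
     * doubles the number of correct low bits of the inverse. */
    static uint32_t demo_mont_mp(uint32_t m0)
    {
        uint32_t x = (((m0 + 2) & 4) << 1) + m0;  /* x*m0 == 1 mod 2^4 */
        x *= 2 - m0 * x;                          /* ... mod 2^8       */
        x *= 2 - m0 * x;                          /* ... mod 2^16      */
        x *= 2 - m0 * x;                          /* ... mod 2^32      */
        return (0u - x) & DEMO_MASK;
    }

    int main(void)
    {
        uint32_t m0 = 0x2b7e151;                  /* any odd low digit of m */
        uint32_t a0 = 0x1234567;                  /* current low digit of a */
        uint32_t mp = demo_mont_mp(m0);
        uint32_t mu = (uint32_t)(((uint64_t)a0 * mp) & DEMO_MASK);

        /* a[0] + mu*m[0] is 0 mod 2^26, so the digit can be shifted away. */
        printf("0x%llx\n", (unsigned long long)
               (((uint64_t)a0 + (uint64_t)mu * m0) & DEMO_MASK));
        return 0;                                 /* prints 0x0 */
    }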
+ */ +SP_NOINLINE static void sp_4096_mul_d_81(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 80; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 3] = (sp_digit)t2; + } + t += tb * a[80]; + r[80] = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[81] = (sp_digit)(t & 0x3ffffff); +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_4096_cond_add_81(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 72; i += 8) { + r[i + 0] = a[i + 0] + (b[i + 0] & m); + r[i + 1] = a[i + 1] + (b[i + 1] & m); + r[i + 2] = a[i + 2] + (b[i + 2] & m); + r[i + 3] = a[i + 3] + (b[i + 3] & m); + r[i + 4] = a[i + 4] + (b[i + 4] & m); + r[i + 5] = a[i + 5] + (b[i + 5] & m); + r[i + 6] = a[i + 6] + (b[i + 6] & m); + r[i + 7] = a[i + 7] + (b[i + 7] & m); + } + r[72] = a[72] + (b[72] & m); + r[73] = a[73] + (b[73] & m); + r[74] = a[74] + (b[74] & m); + r[75] = a[75] + (b[75] & m); + r[76] = a[76] + (b[76] & m); + r[77] = a[77] + (b[77] & m); + r[78] = a[78] + (b[78] & m); +} + +SP_NOINLINE static void sp_4096_rshift_81(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<72; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (26 - n)) & 0x3ffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (26 - n)) & 0x3ffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (26 - n)) & 0x3ffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (26 - n)) & 0x3ffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (26 - n)) & 0x3ffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (26 - n)) & 0x3ffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (26 - n)) & 0x3ffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (26 - n)) & 0x3ffffff); + } + r[72] = (a[72] >> n) | ((a[73] << (26 - n)) & 0x3ffffff); + r[73] = (a[73] >> n) | ((a[74] << (26 - n)) & 0x3ffffff); + r[74] = (a[74] >> n) | ((a[75] << (26 - n)) & 0x3ffffff); + r[75] = (a[75] >> n) | ((a[76] << (26 - n)) & 0x3ffffff); + r[76] = (a[76] >> n) | ((a[77] << (26 - n)) & 0x3ffffff); + r[77] = (a[77] >> n) | ((a[78] << (26 - n)) & 0x3ffffff); + r[78] = (a[78] >> n) | ((a[79] << (26 - n)) & 0x3ffffff); + r[79] = (a[79] >> n) | ((a[80] << (26 - n)) & 0x3ffffff); + r[80] = a[80] >> n; } #ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_256_div_word_10(sp_digit d1, sp_digit d0, +static WC_INLINE sp_digit sp_4096_div_word_81(sp_digit d1, sp_digit d0, sp_digit dv) { sp_digit d; @@ -17715,7 +17304,7 @@ static WC_INLINE sp_digit sp_256_div_word_10(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -17723,75 +17312,91 @@ static WC_INLINE sp_digit sp_256_div_word_10(sp_digit d1, sp_digit d0, * r Remainder from the division. 
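The cond_add/cond_sub helpers above never branch on the condition: the caller passes a mask m that is either 0 or all ones, and b[i] & m contributes either nothing or the whole digit, so the instruction stream and memory accesses are identical in both cases. A trivial sketch of the same idea (demo_* names are mine):

    #include <stdio.h>
    #include <stdint.h>

    /* Branch-free conditional add: m is 0 (leave a unchanged) or -1 (add b). */
    static void demo_cond_add(int32_t* r, const int32_t* a, const int32_t* b,
                              int32_t m, int words)
    {
        int i;
        for (i = 0; i < words; i++)
            r[i] = a[i] + (b[i] & m);
    }

    int main(void)
    {
        int32_t a[2] = { 5, 6 };
        int32_t b[2] = { 1, 1 };
        int32_t r[2];

        demo_cond_add(r, a, b, 0, 2);      /* mask 0:  r = a     */
        printf("%d %d\n", r[0], r[1]);     /* prints 5 6         */
        demo_cond_add(r, a, b, -1, 2);     /* mask -1: r = a + b */
        printf("%d %d\n", r[0], r[1]);     /* prints 6 7         */
        return 0;
    }

Callers derive the mask as 0 minus a 0/1 condition, as in the r[78] sign test of the division routine that follows.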
* returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_256_div_10(const sp_digit* a, const sp_digit* d, +static int sp_4096_div_81(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 10 + 1]; + sp_digit t1[4 * 81 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 10 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 81 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 10; + (void)m; - dv = d[9]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 10U); - for (i=9; i>=0; i--) { - t1[10 + i] += t1[10 + i - 1] >> 26; - t1[10 + i - 1] &= 0x3ffffff; + if (err == MP_OKAY) { + t2 = t1 + 162 + 1; + sd = t2 + 81 + 1; + + sp_4096_mul_d_81(sd, d, (sp_digit)1 << 6); + sp_4096_mul_d_162(t1, a, (sp_digit)1 << 6); + dv = sd[78]; + t1[79 + 79] += t1[79 + 79 - 1] >> 26; + t1[79 + 79 - 1] &= 0x3ffffff; + for (i=79; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 - d1 = t1[10 + i]; + d1 = t1[79 + i]; d1 <<= 26; - d1 += t1[10 + i - 1]; + d1 += t1[79 + i - 1]; r1 = (sp_digit)(d1 / dv); #else - r1 = sp_256_div_word_10(t1[10 + i], t1[10 + i - 1], dv); + r1 = sp_4096_div_word_81(t1[79 + i], t1[79 + i - 1], dv); #endif - sp_256_mul_d_10(t2, d, r1); - (void)sp_256_sub_10(&t1[i], &t1[i], t2); - sp_256_norm_10(&t1[i]); - t1[10 + i] -= t2[10]; - t1[10 + i] += t1[10 + i - 1] >> 26; - t1[10 + i - 1] &= 0x3ffffff; - r1 = (((-t1[10 + i]) << 26) - t1[10 + i - 1]) / dv; - r1++; - sp_256_mul_d_10(t2, d, r1); - (void)sp_256_add_10(&t1[i], &t1[i], t2); - t1[10 + i] += t1[10 + i - 1] >> 26; - t1[10 + i - 1] &= 0x3ffffff; + sp_4096_mul_d_81(t2, sd, r1); + (void)sp_4096_sub_81(&t1[i], &t1[i], t2); + sp_4096_norm_79(&t1[i]); + t1[79 + i] += t1[79 + i - 1] >> 26; + t1[79 + i - 1] &= 0x3ffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[79 + i]; + d1 <<= 26; + d1 -= t1[79 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_81(-t1[79 + i], -t1[79 + i - 1], dv); +#endif + r1 -= t1[79 + i]; + sp_4096_mul_d_81(t2, sd, r1); + (void)sp_4096_add_81(&t1[i], &t1[i], t2); + t1[79 + i] += t1[79 + i - 1] >> 26; + t1[79 + i - 1] &= 0x3ffffff; } - t1[10 - 1] += t1[10 - 2] >> 26; - t1[10 - 2] &= 0x3ffffff; - r1 = t1[10 - 1] / dv; + t1[79 - 1] += t1[79 - 2] >> 26; + t1[79 - 2] &= 0x3ffffff; + r1 = t1[79 - 1] / dv; - sp_256_mul_d_10(t2, d, r1); - (void)sp_256_sub_10(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 20U); - for (i=0; i<9; i++) { + sp_4096_mul_d_81(t2, sd, r1); + sp_4096_sub_81(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 162U); + for (i=0; i<78; i++) { r[i+1] += r[i] >> 26; r[i] &= 0x3ffffff; } - sp_256_cond_add_10(r, r, d, 0 - ((r[9] < 0) ? + sp_4096_cond_add_81(r, r, sd, 0 - ((r[78] < 0) ? (sp_digit)1 : (sp_digit)0)); + + sp_4096_norm_79(r); + sp_4096_rshift_81(r, r, 6); + r[79] = 0; + r[80] = 0; } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -17809,9 +17414,7786 @@ static int sp_256_div_10(const sp_digit* a, const sp_digit* d, * m A single precision number that is the modulus to reduce with. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
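The new sp_4096_div_81 first scales both operands, via sp_4096_mul_d_81(sd, d, 1 << 6) and sp_4096_mul_d_162(t1, a, 1 << 6), so that the divisor's top digit dv = sd[78] is a full 26-bit digit; with a full top digit the two-digit quotient estimate stays close enough for the single correction step that follows, and the remainder is shifted back down by 6 at the end. The shift amount is simply the slack left in the modulus's top digit; a sketch of that arithmetic (the function name is mine):

    #include <stdio.h>

    /* Normalisation shift: make the divisor's top 26-bit digit full. */
    static int demo_norm_shift(int mod_bits, int digit_bits)
    {
        int top_bits = mod_bits % digit_bits;
        return (top_bits == 0) ? 0 : (digit_bits - top_bits);
    }

    int main(void)
    {
        /* 2048-bit halves: 2048 = 78*26 + 20, shift by 6 (mul_d_81, 1 << 6)       */
        printf("%d\n", demo_norm_shift(2048, 26));
        /* full 4096-bit modulus: 4096 = 157*26 + 14, shift by 12 (mul_d_162, 1 << 12) */
        printf("%d\n", demo_norm_shift(4096, 26));
        return 0;                          /* prints 6 then 12 */
    }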
*/ -static int sp_256_mod_10(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_4096_mod_81(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_256_div_10(a, m, NULL, r); + return sp_4096_div_81(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_4096_mod_exp_81(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 162]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 81 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 81 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 81U * 2U); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_81(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_81(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 81U); + } + } + if (err == MP_OKAY) { + sp_4096_mul_81(t[1], t[1], norm); + err = sp_4096_mod_81(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 26; + c = bits % 26; + n = e[i--] << (26 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 26; + } + + y = (int)((n >> 25) & 1); + n <<= 1; + + sp_4096_mont_mul_81(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 81 * 2); + sp_4096_mont_sqr_81(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 81 * 2); + } + + sp_4096_mont_reduce_81(t[0], m, mp); + n = sp_4096_cmp_81(t[0], m); + sp_4096_cond_sub_81(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 81 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 162]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 81 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 81 * 2); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_81(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_81(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_81(t[1], t[1], norm); + err = sp_4096_mod_81(t[1], t[1], m); + } + } + else { + sp_4096_mul_81(t[1], a, norm); + err = sp_4096_mod_81(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 26; + c = bits % 26; + n = e[i--] << (26 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 26; + } + + y = (int)((n >> 25) & 1); + n <<= 1; + + sp_4096_mont_mul_81(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 81 * 2); + sp_4096_mont_sqr_81(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 81 * 2); + } + + sp_4096_mont_reduce_81(t[0], m, mp); + n = sp_4096_cmp_81(t[0], m); + sp_4096_cond_sub_81(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 81 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(32 * 162) + 162]; +#endif + sp_digit* t[32]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 162) + 162), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<32; i++) + t[i] = td + i * 162; + rt = td + 5184; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_81(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_81(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_81(t[1], t[1], norm); + err = sp_4096_mod_81(t[1], t[1], m); + } + } + else { + sp_4096_mul_81(t[1], a, norm); + err = sp_4096_mod_81(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_81(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_81(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_81(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_81(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_81(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_81(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_81(t[ 8], t[ 4], m, mp); + sp_4096_mont_mul_81(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_81(t[10], t[ 5], m, mp); + sp_4096_mont_mul_81(t[11], t[ 6], t[ 5], m, mp); + sp_4096_mont_sqr_81(t[12], t[ 6], m, mp); + sp_4096_mont_mul_81(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_81(t[14], t[ 7], m, mp); + sp_4096_mont_mul_81(t[15], t[ 8], t[ 7], m, mp); + sp_4096_mont_sqr_81(t[16], t[ 8], m, mp); + sp_4096_mont_mul_81(t[17], t[ 9], t[ 8], m, mp); + sp_4096_mont_sqr_81(t[18], t[ 9], m, mp); + sp_4096_mont_mul_81(t[19], t[10], t[ 9], m, mp); + sp_4096_mont_sqr_81(t[20], t[10], m, mp); + sp_4096_mont_mul_81(t[21], t[11], t[10], m, mp); + sp_4096_mont_sqr_81(t[22], t[11], m, mp); + sp_4096_mont_mul_81(t[23], t[12], t[11], m, mp); + sp_4096_mont_sqr_81(t[24], t[12], m, mp); + sp_4096_mont_mul_81(t[25], t[13], t[12], m, mp); + sp_4096_mont_sqr_81(t[26], t[13], m, mp); + sp_4096_mont_mul_81(t[27], t[14], t[13], m, mp); + sp_4096_mont_sqr_81(t[28], t[14], m, mp); + sp_4096_mont_mul_81(t[29], t[15], t[14], m, mp); + sp_4096_mont_sqr_81(t[30], t[15], m, mp); + sp_4096_mont_mul_81(t[31], t[16], t[15], m, mp); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 25) / 26) - 1; + c = bits % 26; + if (c == 0) { + c = 26; + } + if (i < 81) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (6 - c); + c += 26; + } + y = (int)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 162); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 6; + y = (byte)((n >> 27) & 0x1f); + n <<= 5; + c = 21; + } + else { + y = (byte)((n >> 27) & 0x1f); + n = e[i--] << 6; + c = 5 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 26 - c; + } + + sp_4096_mont_sqr_81(rt, rt, m, mp); + sp_4096_mont_sqr_81(rt, rt, m, mp); + sp_4096_mont_sqr_81(rt, rt, m, mp); + sp_4096_mont_sqr_81(rt, rt, m, mp); + sp_4096_mont_sqr_81(rt, rt, m, mp); + + sp_4096_mont_mul_81(rt, rt, t[y], m, mp); + } + 
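When neither WOLFSSL_SP_SMALL nor WC_NO_CACHE_RESISTANT applies, the path above precomputes t[2]..t[31] from alternating squarings and multiplies and then consumes the exponent five bits at a time: five Montgomery squarings, then one multiply by the selected table entry. The schedule itself is ordinary fixed-window exponentiation; a toy uint64_t version with a 4-bit window (the 162-word routine further down also uses a 4-bit window). The demo_* names are mine and the modulus is kept small so plain C multiplication cannot overflow:

    #include <stdio.h>
    #include <stdint.h>

    /* Left-to-right fixed 4-bit-window exponentiation: 4 squarings and one
     * table multiply per window. */
    static uint64_t demo_modexp_win(uint64_t a, uint64_t e, uint64_t m)
    {
        uint64_t t[16];
        uint64_t r;
        int i, j;

        t[0] = 1 % m;
        for (i = 1; i < 16; i++)
            t[i] = (t[i - 1] * a) % m;     /* t[i] = a^i mod m */

        r = 1 % m;
        for (i = 60; i >= 0; i -= 4) {     /* 16 windows cover 64 bits */
            for (j = 0; j < 4; j++)
                r = (r * r) % m;
            r = (r * t[(e >> i) & 0xf]) % m;
        }
        return r;
    }

    int main(void)
    {
        /* 561 = 3*11*17 is a Carmichael number, so 7^560 mod 561 == 1. */
        printf("%llu\n", (unsigned long long)demo_modexp_win(7, 560, 561));
        return 0;
    }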
+ sp_4096_mont_reduce_81(rt, m, mp); + n = sp_4096_cmp_81(rt, m); + sp_4096_cond_sub_81(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 162); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */ + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 4096 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_4096_mont_norm_162(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i = 0; i < 152; i += 8) { + r[i + 0] = 0x3ffffff; + r[i + 1] = 0x3ffffff; + r[i + 2] = 0x3ffffff; + r[i + 3] = 0x3ffffff; + r[i + 4] = 0x3ffffff; + r[i + 5] = 0x3ffffff; + r[i + 6] = 0x3ffffff; + r[i + 7] = 0x3ffffff; + } + r[152] = 0x3ffffff; + r[153] = 0x3ffffff; + r[154] = 0x3ffffff; + r[155] = 0x3ffffff; + r[156] = 0x3ffffff; + r[157] = 0x3fffL; + r[158] = 0; + r[159] = 0; + r[160] = 0; + r[161] = 0; + + /* r = (2^n - 1) mod n */ + (void)sp_4096_sub_162(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_4096_cmp_162(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + r |= (a[161] - b[161]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[160] - b[160]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + for (i = 152; i >= 0; i -= 8) { + r |= (a[i + 7] - b[i + 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 6] - b[i + 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 5] - b[i + 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 4] - b[i + 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 3] - b[i + 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 2] - b[i + 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_4096_cond_sub_162(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 160; i += 8) { + r[i + 0] = a[i + 0] - (b[i + 0] & m); + r[i + 1] = a[i + 1] - (b[i + 1] & m); + r[i + 2] = a[i + 2] - (b[i + 2] & m); + r[i + 3] = a[i + 3] - (b[i + 3] & m); + r[i + 4] = a[i + 4] - (b[i + 4] & m); + r[i + 5] = a[i + 5] - (b[i + 5] & m); + r[i + 6] = a[i + 6] - (b[i + 6] & m); + r[i + 7] = a[i + 7] - (b[i + 7] & m); + } + r[160] = a[160] - (b[160] & m); + r[161] = a[161] - (b[161] & m); +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. 
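sp_4096_cmp_81 and sp_4096_cmp_162 walk from the most significant digit down and AND every later difference with a mask derived from (r == 0), so the first non-zero difference latches into r and nothing below it can overwrite the result; the number of iterations never depends on the data. The same latch on a short array (the demo name is mine):

    #include <stdio.h>
    #include <stdint.h>

    /* Constant-time compare of little-endian digit arrays: negative, zero or
     * positive like memcmp, but with a fixed access pattern. */
    static int32_t demo_cmp(const int32_t* a, const int32_t* b, int words)
    {
        int32_t r = 0;
        int i;

        for (i = words - 1; i >= 0; i--)
            r |= (a[i] - b[i]) & (0 - (int32_t)((r == 0) ? 1 : 0));
        return r;
    }

    int main(void)
    {
        int32_t a[4] = { 1, 2, 3, 4 };
        int32_t b[4] = { 5, 2, 3, 4 };     /* differs only in the lowest digit */

        printf("%d\n", (int)demo_cmp(a, b, 4));   /* prints -4: a < b */
        return 0;
    }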
+ */ +SP_NOINLINE static void sp_4096_mul_add_162(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 162; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x3ffffff; + t >>= 26; + } + r[162] += (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 160; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0x3ffffff; + t[1] += t[0] >> 26; + r[i+1] = t[1] & 0x3ffffff; + t[2] += t[1] >> 26; + r[i+2] = t[2] & 0x3ffffff; + t[3] += t[2] >> 26; + r[i+3] = t[3] & 0x3ffffff; + t[4] += t[3] >> 26; + r[i+4] = t[4] & 0x3ffffff; + t[5] += t[4] >> 26; + r[i+5] = t[5] & 0x3ffffff; + t[6] += t[5] >> 26; + r[i+6] = t[6] & 0x3ffffff; + t[7] += t[6] >> 26; + r[i+7] = t[7] & 0x3ffffff; + t[0] = t[7] >> 26; + } + t[0] += (tb * a[160]) + r[160]; + t[1] = (tb * a[161]) + r[161]; + r[160] = t[0] & 0x3ffffff; + t[1] += t[0] >> 26; + r[161] = t[1] & 0x3ffffff; + r[162] += (sp_digit)(t[1] >> 26); +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Shift the result in the high 4096 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_4096_mont_shift_162(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int64 n = a[157] >> 14; + n += ((sp_int64)a[158]) << 12; + for (i = 0; i < 152; i += 8) { + r[i + 0] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 159]) << 12; + r[i + 1] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 160]) << 12; + r[i + 2] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 161]) << 12; + r[i + 3] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 162]) << 12; + r[i + 4] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 163]) << 12; + r[i + 5] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 164]) << 12; + r[i + 6] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 165]) << 12; + r[i + 7] = n & 0x3ffffff; + n >>= 26; n += ((sp_int64)a[i + 166]) << 12; + } + r[152] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[311]) << 12; + r[153] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[312]) << 12; + r[154] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[313]) << 12; + r[155] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[314]) << 12; + r[156] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[315]) << 12; + r[157] = (sp_digit)n; + XMEMSET(&r[158], 0, sizeof(*r) * 158U); +} + +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static void sp_4096_mont_reduce_162(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_4096_norm_162(a + 158); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<157; i++) { + mu = (a[i] * mp) & 0x3ffffff; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + } + mu = (a[i] * mp) & 0x3fffL; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + a[i] &= 0x3ffffff; + } + else { + for (i=0; i<157; i++) { + mu = a[i] & 0x3ffffff; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + } + mu = a[i] & 0x3fffL; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + a[i] &= 0x3ffffff; + } +#else + for (i=0; i<157; i++) { + mu = (a[i] * mp) & 0x3ffffff; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + } + mu = (a[i] * mp) & 0x3fffL; + sp_4096_mul_add_162(a+i, m, mu); + a[i+1] += a[i] >> 26; + a[i] &= 0x3ffffff; +#endif + sp_4096_mont_shift_162(a, a); + sp_4096_cond_sub_162(a, a, m, 0 - (((a[157] - m[157]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_4096_norm_162(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_mul_162(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_4096_mul_162(r, a, b); + sp_4096_mont_reduce_162(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_sqr_162(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_4096_sqr_162(r, a); + sp_4096_mont_reduce_162(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_d_324(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 324; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x3ffffff); + t >>= 26; + r[i + 3] = (sp_digit)t2; + } + r[324] = (sp_digit)(t & 0x3ffffff); +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static void sp_4096_cond_add_162(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 152; i += 8) { + r[i + 0] = a[i + 0] + (b[i + 0] & m); + r[i + 1] = a[i + 1] + (b[i + 1] & m); + r[i + 2] = a[i + 2] + (b[i + 2] & m); + r[i + 3] = a[i + 3] + (b[i + 3] & m); + r[i + 4] = a[i + 4] + (b[i + 4] & m); + r[i + 5] = a[i + 5] + (b[i + 5] & m); + r[i + 6] = a[i + 6] + (b[i + 6] & m); + r[i + 7] = a[i + 7] + (b[i + 7] & m); + } + r[152] = a[152] + (b[152] & m); + r[153] = a[153] + (b[153] & m); + r[154] = a[154] + (b[154] & m); + r[155] = a[155] + (b[155] & m); + r[156] = a[156] + (b[156] & m); + r[157] = a[157] + (b[157] & m); +} + +SP_NOINLINE static void sp_4096_rshift_162(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<160; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (26 - n)) & 0x3ffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (26 - n)) & 0x3ffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (26 - n)) & 0x3ffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (26 - n)) & 0x3ffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (26 - n)) & 0x3ffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (26 - n)) & 0x3ffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (26 - n)) & 0x3ffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (26 - n)) & 0x3ffffff); + } + r[160] = (a[160] >> n) | ((a[161] << (26 - n)) & 0x3ffffff); + r[161] = a[161] >> n; +} + +#ifdef WOLFSSL_SP_DIV_32 +static WC_INLINE sp_digit sp_4096_div_word_162(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 26 bits from d1 and top 5 bits from d0. */ + d = (d1 << 5) + (d0 >> 21); + r = d / dv; + d -= r * dv; + /* Up to 6 bits in r */ + /* Next 5 bits from d0. */ + r <<= 5; + d <<= 5; + d += (d0 >> 16) & ((1 << 5) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 5 bits from d0. */ + r <<= 5; + d <<= 5; + d += (d0 >> 11) & ((1 << 5) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 16 bits in r */ + /* Next 5 bits from d0. */ + r <<= 5; + d <<= 5; + d += (d0 >> 6) & ((1 << 5) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 5 bits from d0. */ + r <<= 5; + d <<= 5; + d += (d0 >> 1) & ((1 << 5) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 26 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 26 bits from d1 and top 5 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_32 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
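sp_4096_div_word_162 (compiled when WOLFSSL_SP_DIV_32 rules out 64-bit division) is schoolbook long division of the 52-bit pair d1:d0 by a 26-bit divisor, consuming the top 5 bits of d0 first and then 5 more bits (finally 1) per step so that every intermediate value and every division fits in 32 bits. A generic sketch of the same peeling, checked against a 64-bit division; the names are mine, and as in the caller the divisor is assumed normalised so the top bit of its 26-bit digit is set:

    #include <stdio.h>
    #include <stdint.h>

    /* floor((d1*2^26 + d0) / dv) using only 32-bit divisions. */
    static uint32_t demo_div_word(uint32_t d1, uint32_t d0, uint32_t dv)
    {
        uint32_t d = (d1 << 5) + (d0 >> 21);   /* d1 plus top 5 bits of d0 */
        uint32_t r = d / dv;
        int s;

        d -= r * dv;
        for (s = 16; s >= 1; s -= 5) {         /* next 5-bit slices of d0  */
            uint32_t t;
            r <<= 5;
            d = (d << 5) + ((d0 >> s) & 0x1f);
            t = d / dv;
            d -= t * dv;
            r += t;
        }
        r <<= 1;                               /* final bit of d0          */
        d = (d << 1) + (d0 & 1);
        return r + d / dv;
    }

    int main(void)
    {
        uint32_t d1 = 0x123456, d0 = 0x2abcdef, dv = 0x2000001;
        uint64_t n = ((uint64_t)d1 << 26) + d0;

        /* both prints are the same quotient */
        printf("%u %u\n", demo_div_word(d1, d0, dv), (uint32_t)(n / dv));
        return 0;
    }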
+ */ +static int sp_4096_div_162(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_32 + sp_int64 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 162 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 162 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 324 + 1; + sd = t2 + 162 + 1; + + sp_4096_mul_d_162(sd, d, (sp_digit)1 << 12); + sp_4096_mul_d_324(t1, a, (sp_digit)1 << 12); + dv = sd[157]; + t1[158 + 158] += t1[158 + 158 - 1] >> 26; + t1[158 + 158 - 1] &= 0x3ffffff; + for (i=158; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_32 + d1 = t1[158 + i]; + d1 <<= 26; + d1 += t1[158 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_162(t1[158 + i], t1[158 + i - 1], dv); +#endif + + sp_4096_mul_d_162(t2, sd, r1); + (void)sp_4096_sub_162(&t1[i], &t1[i], t2); + sp_4096_norm_158(&t1[i]); + t1[158 + i] += t1[158 + i - 1] >> 26; + t1[158 + i - 1] &= 0x3ffffff; +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[158 + i]; + d1 <<= 26; + d1 -= t1[158 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_162(-t1[158 + i], -t1[158 + i - 1], dv); +#endif + r1 -= t1[158 + i]; + sp_4096_mul_d_162(t2, sd, r1); + (void)sp_4096_add_162(&t1[i], &t1[i], t2); + t1[158 + i] += t1[158 + i - 1] >> 26; + t1[158 + i - 1] &= 0x3ffffff; + } + t1[158 - 1] += t1[158 - 2] >> 26; + t1[158 - 2] &= 0x3ffffff; + r1 = t1[158 - 1] / dv; + + sp_4096_mul_d_162(t2, sd, r1); + sp_4096_sub_162(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 324U); + for (i=0; i<157; i++) { + r[i+1] += r[i] >> 26; + r[i] &= 0x3ffffff; + } + sp_4096_cond_add_162(r, r, sd, 0 - ((r[157] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_4096_norm_158(r); + sp_4096_rshift_162(r, r, 12); + r[158] = 0; + r[159] = 0; + r[160] = 0; + r[161] = 0; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_4096_mod_162(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_162(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
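The cache-resistant exponentiation paths (for both digit counts) choose which operand buffer to square by arithmetic on the pointers themselves, through the addr_mask[] table defined earlier in this file (not in these hunks); only that two-entry table, which sits in a single cache line, is indexed by the secret bit y, so no branch or large-buffer access pattern depends on it. A sketch of the branch-free pointer select, under the assumption that addr_mask has the form { 0, (size_t)-1 }; the demo_* names are mine:

    #include <stdio.h>
    #include <stddef.h>

    static const size_t demo_addr_mask[2] = { 0, (size_t)-1 };

    /* Pick p0 when bit == 0 and p1 when bit == 1 without a data-dependent
     * branch: exactly one of the two masks is all ones. */
    static void* demo_select(void* p0, void* p1, unsigned bit)
    {
        return (void*)(((size_t)p0 & demo_addr_mask[bit ^ 1]) +
                       ((size_t)p1 & demo_addr_mask[bit]));
    }

    int main(void)
    {
        int x = 1, y = 2;

        printf("%d %d\n", *(int*)demo_select(&x, &y, 0),
                          *(int*)demo_select(&x, &y, 1));   /* prints 1 2 */
        return 0;
    }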
+ */ +static int sp_4096_mod_exp_162(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 324]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 162 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 162 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 162U * 2U); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_162(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 162U); + } + } + if (err == MP_OKAY) { + sp_4096_mul_162(t[1], t[1], norm); + err = sp_4096_mod_162(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 26; + c = bits % 26; + n = e[i--] << (26 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 26; + } + + y = (int)((n >> 25) & 1); + n <<= 1; + + sp_4096_mont_mul_162(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 162 * 2); + sp_4096_mont_sqr_162(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 162 * 2); + } + + sp_4096_mont_reduce_162(t[0], m, mp); + n = sp_4096_cmp_162(t[0], m); + sp_4096_cond_sub_162(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 162 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 324]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 162 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 162 * 2); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_162(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_162(t[1], t[1], norm); + err = sp_4096_mod_162(t[1], t[1], m); + } + } + else { + sp_4096_mul_162(t[1], a, norm); + err = sp_4096_mod_162(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 26; + c = bits % 26; + n = e[i--] << (26 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 26; + } + + y = (int)((n >> 25) & 1); + n <<= 1; + + sp_4096_mont_mul_162(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 162 * 2); + sp_4096_mont_sqr_162(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 162 * 2); + } + + sp_4096_mont_reduce_162(t[0], m, mp); + n = sp_4096_cmp_162(t[0], m); + sp_4096_cond_sub_162(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 162 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 324) + 324]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 324) + 324), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 324; + rt = td + 5184; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_162(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_162(t[1], t[1], norm); + err = sp_4096_mod_162(t[1], t[1], m); + } + } + else { + sp_4096_mul_162(t[1], a, norm); + err = sp_4096_mod_162(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_162(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_162(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_162(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_162(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_162(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_162(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_162(t[ 8], t[ 4], m, mp); + sp_4096_mont_mul_162(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_162(t[10], t[ 5], m, mp); + sp_4096_mont_mul_162(t[11], t[ 6], t[ 5], m, mp); + sp_4096_mont_sqr_162(t[12], t[ 6], m, mp); + sp_4096_mont_mul_162(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_162(t[14], t[ 7], m, mp); + sp_4096_mont_mul_162(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 25) / 26) - 1; + c = bits % 26; + if (c == 0) { + c = 26; + } + if (i < 162) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (6 - c); + c += 26; + } + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 324); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 6; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 22; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 6; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 26 - c; + } + + sp_4096_mont_sqr_162(rt, rt, m, mp); + sp_4096_mont_sqr_162(rt, rt, m, mp); + sp_4096_mont_sqr_162(rt, rt, m, mp); + sp_4096_mont_sqr_162(rt, rt, m, mp); + + sp_4096_mont_mul_162(rt, rt, t[y], m, mp); + } + + sp_4096_mont_reduce_162(rt, m, mp); + n = sp_4096_cmp_162(rt, m); + sp_4096_cond_sub_162(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 324); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ + /* WOLFSSL_HAVE_SP_DH */ + +#ifdef WOLFSSL_HAVE_SP_RSA +/* RSA public key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * em Public exponent. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. 
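Much of the windowed loop above is the n/c/y bookkeeping that streams fixed-width windows out of an exponent stored as 26-bit digits, including windows that straddle a digit boundary. The extraction is easier to see in isolation; a self-checking sketch that recomputes positions instead of streaming bits (demo_* names are mine and the test value is arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    /* Read one w-bit window starting at bit 'pos' of a little-endian array
     * of 26-bit digits. */
    static uint32_t demo_window(const uint32_t* e, int digits, int pos, int w)
    {
        int d = pos / 26;
        int off = pos % 26;
        uint32_t v = e[d] >> off;

        if ((off + w > 26) && (d + 1 < digits))
            v |= e[d + 1] << (26 - off);       /* window spans two digits */
        return v & ((1u << w) - 1);
    }

    int main(void)
    {
        uint64_t v = 0xABCDE0123456ULL;               /* 48-bit exponent  */
        uint32_t e[2] = { (uint32_t)(v & 0x3ffffff),  /* low 26-bit digit */
                          (uint32_t)(v >> 26) };      /* high digit       */
        int i;
        int ok = 1;

        for (i = 44; i >= 0; i -= 4)                  /* all 4-bit windows */
            ok &= (demo_window(e, 2, i, 4) == (uint32_t)((v >> i) & 0xf));
        printf("%s\n", ok ? "windows match" : "mismatch");
        return 0;
    }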
+ * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. + */ +int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, + const mp_int* mm, byte* out, word32* outLen) +{ +#ifdef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[162 * 5]; +#endif + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit* norm = NULL; + sp_digit e[1] = {0}; + sp_digit mp; + int i; + int err = MP_OKAY; + + if (*outLen < 512U) { + err = MP_TO_E; + } + + if (err == MP_OKAY) { + if (mp_count_bits(em) > 26) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + r = a + 162 * 2; + m = r + 162 * 2; + norm = r; + + sp_4096_from_bin(a, 162, in, inLen); +#if DIGIT_BIT >= 26 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + + if (err == MP_OKAY) { + sp_4096_from_mp(m, 162, mm); + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + } + if (err == MP_OKAY) { + sp_4096_mul_162(a, a, norm); + err = sp_4096_mod_162(a, a, m); + } + if (err == MP_OKAY) { + for (i=25; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 162 * 2); + for (i--; i>=0; i--) { + sp_4096_mont_sqr_162(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_4096_mont_mul_162(r, r, a, m, mp); + } + } + sp_4096_mont_reduce_162(r, m, mp); + mp = sp_4096_cmp_162(r, m); + sp_4096_cond_sub_162(r, r, m, ((mp < 0) ? 
+ (sp_digit)1 : (sp_digit)0)- 1); + + sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[162 * 5]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit e[1] = {0}; + int err = MP_OKAY; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(em) > 26) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d; + r = a + 162 * 2; + m = r + 162 * 2; + + sp_4096_from_bin(a, 162, in, inLen); +#if DIGIT_BIT >= 26 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + if (err == MP_OKAY) { + sp_4096_from_mp(m, 162, mm); + + if (e[0] == 0x3) { + sp_4096_sqr_162(r, a); + err = sp_4096_mod_162(r, r, m); + if (err == MP_OKAY) { + sp_4096_mul_162(r, a, r); + err = sp_4096_mod_162(r, r, m); + } + } + else { + sp_digit* norm = r; + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + + sp_4096_mul_162(a, a, norm); + err = sp_4096_mod_162(a, a, m); + + if (err == MP_OKAY) { + for (i=25; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 324U); + for (i--; i>=0; i--) { + sp_4096_mont_sqr_162(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_4096_mont_mul_162(r, r, a, m, mp); + } + } + sp_4096_mont_reduce_162(r, m, mp); + mp = sp_4096_cmp_162(r, m); + sp_4096_cond_sub_162(r, r, m, ((mp < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + } + } + + if (err == MP_OKAY) { + sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#endif /* WOLFSSL_SP_SMALL */ +} + +#ifndef WOLFSSL_RSA_PUBLIC_ONLY +#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM) +#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */ +/* RSA private key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * dm Private exponent. + * pm First prime. + * qm Second prime. + * dpm First prime's CRT exponent. + * dqm Second prime's CRT exponent. + * qim Inverse of second prime mod p. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. + * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. 
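The non-small sp_RsaPublic_4096 path above short-circuits the common public exponent e == 3: one modular squaring and one modular multiply, with no Montgomery conversion at all. The identity it relies on, on toy numbers (names and values are illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    /* a^3 mod m as ((a^2 mod m) * a) mod m, two modular products. */
    static uint64_t demo_cube_mod(uint64_t a, uint64_t m)
    {
        uint64_t r = (a * a) % m;
        return (r * a) % m;
    }

    int main(void)
    {
        uint64_t a = 12345, m = 99991;

        printf("%llu\n", (unsigned long long)demo_cube_mod(a, m)); /* 1802 */
        return 0;
    }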
+ */ +int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, + const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm, + const mp_int* qim, const mp_int* mm, byte* out, word32* outLen) +{ +#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[162 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 4096) { + err = MP_READ_E; + } + else if (inLen > 512) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 162; + m = a + 324; + r = a; + + sp_4096_from_bin(a, 162, in, inLen); + sp_4096_from_mp(d, 162, dm); + sp_4096_from_mp(m, 162, mm); + err = sp_4096_mod_exp_162(r, a, d, 4096, m, 0); + } + + if (err == MP_OKAY) { + sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 162); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[162 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 4096) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 162; + m = a + 324; + r = a; + + sp_4096_from_bin(a, 162, in, inLen); + sp_4096_from_mp(d, 162, dm); + sp_4096_from_mp(m, 162, mm); + err = sp_4096_mod_exp_162(r, a, d, 4096, m, 0); + } + + if (err == MP_OKAY) { + sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 162); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#else +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[81 * 8]; +#endif + sp_digit* p = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = 
NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 512) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 81 * 8, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + if (err == MP_OKAY) { + p = a + 162; + qi = dq = dp = p + 81; + tmpa = qi + 81; + tmpb = tmpa + 162; + r = a; + + sp_4096_from_bin(a, 162, in, inLen); + sp_4096_from_mp(p, 81, pm); + sp_4096_from_mp(dp, 81, dpm); + err = sp_4096_mod_exp_81(tmpa, a, dp, 2048, p, 1); + } + if (err == MP_OKAY) { + sp_4096_from_mp(p, 81, qm); + sp_4096_from_mp(dq, 81, dqm); + err = sp_4096_mod_exp_81(tmpb, a, dq, 2048, p, 1); + } + if (err == MP_OKAY) { + sp_4096_from_mp(p, 81, pm); + (void)sp_4096_sub_81(tmpa, tmpa, tmpb); + sp_4096_norm_79(tmpa); + sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31)); + sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31)); + + sp_4096_from_mp(qi, 81, qim); + sp_4096_mul_81(tmpa, tmpa, qi); + err = sp_4096_mod_81(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_4096_from_mp(p, 81, qm); + sp_4096_mul_81(tmpa, p, tmpa); + (void)sp_4096_add_162(r, tmpb, tmpa); + sp_4096_norm_162(r); + + sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 81 * 8); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[81 * 13]; +#endif + sp_digit* p = NULL; + sp_digit* q = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 81 * 13, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = a + 162 * 2; + q = p + 81; + dp = q + 81; + dq = dp + 81; + qi = dq + 81; + tmpa = qi + 81; + tmpb = tmpa + 162; + r = a; + + sp_4096_from_bin(a, 162, in, inLen); + sp_4096_from_mp(p, 81, pm); + sp_4096_from_mp(q, 81, qm); + sp_4096_from_mp(dp, 81, dpm); + sp_4096_from_mp(dq, 81, dqm); + sp_4096_from_mp(qi, 81, qim); + + err = sp_4096_mod_exp_81(tmpa, a, dp, 2048, p, 1); + } + if (err == MP_OKAY) { + err = sp_4096_mod_exp_81(tmpb, a, dq, 2048, q, 1); + } + + if (err == MP_OKAY) { + (void)sp_4096_sub_81(tmpa, tmpa, tmpb); + sp_4096_norm_79(tmpa); + sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31)); + sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31)); + sp_4096_mul_81(tmpa, tmpa, qi); + err = sp_4096_mod_81(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_4096_mul_81(tmpa, tmpa, q); + (void)sp_4096_add_162(r, tmpb, tmpa); + sp_4096_norm_162(r); + + 
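The CRT branch above exponentiates modulo each 2048-bit prime with the 81-word routines and then recombines with Garner's formula: tmpa = in^dp mod p, tmpb = in^dq mod q, tmpa = qi * (tmpa - tmpb) mod p, result = tmpb + q * tmpa. A toy-number check of that recombination against a direct private-key exponentiation; every constant below is an illustrative textbook key, none of it comes from the patch:

    #include <stdio.h>
    #include <stdint.h>

    /* Plain square-and-multiply, small enough that uint64_t cannot overflow. */
    static uint64_t demo_powm(uint64_t a, uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;

        a %= m;
        while (e != 0) {
            if (e & 1)
                r = (r * a) % m;
            a = (a * a) % m;
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t p = 61, q = 53, n = p * q;           /* toy RSA, n = 3233   */
        uint64_t d = 413;                             /* e = 17, d mod lcm   */
        uint64_t dp = d % (p - 1), dq = d % (q - 1);  /* CRT exponents       */
        uint64_t qi = 38;                             /* 53 * 38 mod 61 == 1 */
        uint64_t c = demo_powm(42, 17, n);            /* "ciphertext" of 42  */

        uint64_t mp = demo_powm(c, dp, p);
        uint64_t mq = demo_powm(c, dq, q);
        uint64_t h  = (qi * ((mp + p - (mq % p)) % p)) % p;
        uint64_t m  = mq + h * q;                     /* CRT recombination   */

        /* both prints recover the original message, 42 */
        printf("%llu %llu\n", (unsigned long long)m,
                              (unsigned long long)demo_powm(c, d, n));
        return 0;
    }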
sp_4096_to_bin_162(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) +if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 81 * 13); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); + #endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */ +} + +#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */ +#endif /* WOLFSSL_HAVE_SP_RSA */ +#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_4096_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 26 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 162); + r->used = 162; + mp_clamp(r); +#elif DIGIT_BIT < 26 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 162; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 26) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 26 - s; + } + r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 162; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 26 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 26 - s; + } + else { + s += 26; + } + } + r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
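sp_4096_to_mp above (and the from_mp counterpart used throughout) is only a base change: 162 digits of 26 bits are repacked into mp_int digits of DIGIT_BIT bits, with separate branches for DIGIT_BIT equal to, smaller than and larger than 26. A minimal round-trip of the same repacking through a 64-bit value (constants and names are mine):

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_DIGIT_BITS 26
    #define DEMO_MASK ((1u << DEMO_DIGIT_BITS) - 1)

    int main(void)
    {
        uint64_t v = 0x123456789abcdefULL;
        uint32_t d[3];
        uint64_t back = 0;
        int i;

        for (i = 0; i < 3; i++)            /* split into 26-bit digits */
            d[i] = (uint32_t)((v >> (i * DEMO_DIGIT_BITS)) & DEMO_MASK);
        for (i = 2; i >= 0; i--)           /* reassemble               */
            back = (back << DEMO_DIGIT_BITS) | d[i];

        printf("%s\n", (back == v) ? "round trip ok" : "mismatch");
        return 0;
    }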
+ */ +int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[162 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expBits > 4096) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 162 * 2; + m = e + 162; + r = b; + + sp_4096_from_mp(b, 162, base); + sp_4096_from_mp(e, 162, exp); + sp_4096_from_mp(m, 162, mod); + + err = sp_4096_mod_exp_162(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + err = sp_4096_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 162U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[162 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expBits > 4096) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 162 * 2; + m = e + 162; + r = b; + + sp_4096_from_mp(b, 162, base); + sp_4096_from_mp(e, 162, exp); + sp_4096_from_mp(m, 162, mod); + + err = sp_4096_mod_exp_162(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + err = sp_4096_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 162U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_4096 +SP_NOINLINE static void sp_4096_lshift_162(sp_digit* r, const sp_digit* a, + byte n) +{ + sp_int_digit s; + sp_int_digit t; + + s = (sp_int_digit)a[161]; + r[162] = s >> (26U - n); + s = (sp_int_digit)(a[161]); t = (sp_int_digit)(a[160]); + r[161] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[160]); t = (sp_int_digit)(a[159]); + r[160] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[159]); t = (sp_int_digit)(a[158]); + r[159] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[158]); t = (sp_int_digit)(a[157]); + r[158] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[157]); t = (sp_int_digit)(a[156]); + r[157] = 
((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[156]); t = (sp_int_digit)(a[155]); + r[156] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[155]); t = (sp_int_digit)(a[154]); + r[155] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[154]); t = (sp_int_digit)(a[153]); + r[154] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[153]); t = (sp_int_digit)(a[152]); + r[153] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[152]); t = (sp_int_digit)(a[151]); + r[152] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[151]); t = (sp_int_digit)(a[150]); + r[151] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[150]); t = (sp_int_digit)(a[149]); + r[150] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[149]); t = (sp_int_digit)(a[148]); + r[149] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[148]); t = (sp_int_digit)(a[147]); + r[148] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[147]); t = (sp_int_digit)(a[146]); + r[147] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[146]); t = (sp_int_digit)(a[145]); + r[146] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[145]); t = (sp_int_digit)(a[144]); + r[145] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[144]); t = (sp_int_digit)(a[143]); + r[144] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[143]); t = (sp_int_digit)(a[142]); + r[143] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[142]); t = (sp_int_digit)(a[141]); + r[142] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[141]); t = (sp_int_digit)(a[140]); + r[141] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[140]); t = (sp_int_digit)(a[139]); + r[140] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[139]); t = (sp_int_digit)(a[138]); + r[139] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[138]); t = (sp_int_digit)(a[137]); + r[138] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[137]); t = (sp_int_digit)(a[136]); + r[137] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[136]); t = (sp_int_digit)(a[135]); + r[136] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[135]); t = (sp_int_digit)(a[134]); + r[135] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[134]); t = (sp_int_digit)(a[133]); + r[134] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[133]); t = (sp_int_digit)(a[132]); + r[133] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[132]); t = (sp_int_digit)(a[131]); + r[132] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[131]); t = (sp_int_digit)(a[130]); + r[131] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[130]); t = (sp_int_digit)(a[129]); + r[130] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[129]); t = (sp_int_digit)(a[128]); + r[129] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[128]); t = (sp_int_digit)(a[127]); + r[128] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[127]); t = (sp_int_digit)(a[126]); + r[127] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[126]); t = (sp_int_digit)(a[125]); + r[126] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[125]); t = 
(sp_int_digit)(a[124]); + r[125] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[124]); t = (sp_int_digit)(a[123]); + r[124] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[123]); t = (sp_int_digit)(a[122]); + r[123] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[122]); t = (sp_int_digit)(a[121]); + r[122] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[121]); t = (sp_int_digit)(a[120]); + r[121] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[120]); t = (sp_int_digit)(a[119]); + r[120] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[119]); t = (sp_int_digit)(a[118]); + r[119] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[118]); t = (sp_int_digit)(a[117]); + r[118] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[117]); t = (sp_int_digit)(a[116]); + r[117] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[116]); t = (sp_int_digit)(a[115]); + r[116] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[115]); t = (sp_int_digit)(a[114]); + r[115] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[114]); t = (sp_int_digit)(a[113]); + r[114] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[113]); t = (sp_int_digit)(a[112]); + r[113] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[112]); t = (sp_int_digit)(a[111]); + r[112] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[111]); t = (sp_int_digit)(a[110]); + r[111] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[110]); t = (sp_int_digit)(a[109]); + r[110] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[109]); t = (sp_int_digit)(a[108]); + r[109] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[108]); t = (sp_int_digit)(a[107]); + r[108] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[107]); t = (sp_int_digit)(a[106]); + r[107] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[106]); t = (sp_int_digit)(a[105]); + r[106] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[105]); t = (sp_int_digit)(a[104]); + r[105] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[104]); t = (sp_int_digit)(a[103]); + r[104] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[103]); t = (sp_int_digit)(a[102]); + r[103] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[102]); t = (sp_int_digit)(a[101]); + r[102] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[101]); t = (sp_int_digit)(a[100]); + r[101] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[100]); t = (sp_int_digit)(a[99]); + r[100] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[99]); t = (sp_int_digit)(a[98]); + r[99] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[98]); t = (sp_int_digit)(a[97]); + r[98] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[97]); t = (sp_int_digit)(a[96]); + r[97] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[96]); t = (sp_int_digit)(a[95]); + r[96] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[95]); t = (sp_int_digit)(a[94]); + r[95] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[94]); t = (sp_int_digit)(a[93]); + r[94] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = 
(sp_int_digit)(a[93]); t = (sp_int_digit)(a[92]); + r[93] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[92]); t = (sp_int_digit)(a[91]); + r[92] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[91]); t = (sp_int_digit)(a[90]); + r[91] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[90]); t = (sp_int_digit)(a[89]); + r[90] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[89]); t = (sp_int_digit)(a[88]); + r[89] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[88]); t = (sp_int_digit)(a[87]); + r[88] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[87]); t = (sp_int_digit)(a[86]); + r[87] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[86]); t = (sp_int_digit)(a[85]); + r[86] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[85]); t = (sp_int_digit)(a[84]); + r[85] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[84]); t = (sp_int_digit)(a[83]); + r[84] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[83]); t = (sp_int_digit)(a[82]); + r[83] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[82]); t = (sp_int_digit)(a[81]); + r[82] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[81]); t = (sp_int_digit)(a[80]); + r[81] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[80]); t = (sp_int_digit)(a[79]); + r[80] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[79]); t = (sp_int_digit)(a[78]); + r[79] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[78]); t = (sp_int_digit)(a[77]); + r[78] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[77]); t = (sp_int_digit)(a[76]); + r[77] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[76]); t = (sp_int_digit)(a[75]); + r[76] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[75]); t = (sp_int_digit)(a[74]); + r[75] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[74]); t = (sp_int_digit)(a[73]); + r[74] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[73]); t = (sp_int_digit)(a[72]); + r[73] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[72]); t = (sp_int_digit)(a[71]); + r[72] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]); + r[71] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]); + r[70] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]); + r[69] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]); + r[68] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]); + r[67] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]); + r[66] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]); + r[65] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]); + r[64] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]); + r[63] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]); + r[62] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]); + r[61] = ((s << 
n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]); + r[60] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]); + r[59] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]); + r[58] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]); + r[57] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]); + r[56] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]); + r[55] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]); + r[54] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]); + r[53] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]); + r[52] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]); + r[51] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]); + r[50] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]); + r[49] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]); + r[48] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]); + r[47] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]); + r[46] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]); + r[45] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]); + r[44] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]); + r[43] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]); + r[42] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]); + r[41] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]); + r[40] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]); + r[39] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[38]); t = (sp_int_digit)(a[37]); + r[38] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]); + r[37] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]); + r[36] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[35]); t = (sp_int_digit)(a[34]); + r[35] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]); + r[34] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]); + r[33] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]); + r[32] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]); + r[31] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]); + r[30] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); + r[29] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[28]); t 
= (sp_int_digit)(a[27]); + r[28] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); + r[27] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); + r[26] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); + r[25] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); + r[24] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); + r[23] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); + r[22] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); + r[21] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); + r[20] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); + r[19] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); + r[18] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); + r[17] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); + r[16] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); + r[15] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); + r[14] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); + r[13] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); + r[12] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); + r[11] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); + r[10] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + r[0] = (a[0] << n) & 0x3ffffff; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
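+ * + * The base is fixed at 2, so each 4-bit window y of the exponent is handled with + * four Montgomery squarings followed by a multiply by 2^y done as a word shift + * (sp_4096_lshift_162); the bits that spill above 4096 are multiplied by the + * normalizer (2^4096 mod m) and added back before a conditional subtract of m.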
+ */ +static int sp_4096_mod_exp_2_162(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[487]; +#endif + sp_digit* norm = NULL; + sp_digit* tmp = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit o; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 487, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + tmp = td + 324; + XMEMSET(td, 0, sizeof(sp_digit) * 487); + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_162(norm, m); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 25) / 26) - 1; + c = bits % 26; + if (c == 0) { + c = 26; + } + if (i < 162) { + n = e[i--] << (32 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (6 - c); + c += 26; + } + y = (int)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + sp_4096_lshift_162(r, norm, (byte)y); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 6; + y = (byte)((n >> 28) & 0xf); + n <<= 4; + c = 22; + } + else { + y = (byte)((n >> 28) & 0xf); + n = e[i--] << 6; + c = 4 - c; + y |= (byte)((n >> (32 - c)) & ((1 << c) - 1)); + n <<= c; + c = 26 - c; + } + + sp_4096_mont_sqr_162(r, r, m, mp); + sp_4096_mont_sqr_162(r, r, m, mp); + sp_4096_mont_sqr_162(r, r, m, mp); + sp_4096_mont_sqr_162(r, r, m, mp); + + sp_4096_lshift_162(r, r, (byte)y); + sp_4096_mul_d_162(tmp, norm, (r[158] << 12) + (r[157] >> 14)); + r[158] = 0; + r[157] &= 0x3fffL; + (void)sp_4096_add_162(r, r, tmp); + sp_4096_norm_162(r); + o = sp_4096_cmp_162(r, m); + sp_4096_cond_sub_162(r, r, m, ((o < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + + sp_4096_mont_reduce_162(r, m, mp); + n = sp_4096_cmp_162(r, m); + sp_4096_cond_sub_162(r, r, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_4096 */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. + * exp Array of bytes that is the exponent. + * expLen Length of data, in bytes, in exponent. + * mod Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. + * outLen Length, in bytes, of exponentiation result. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
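+ * + * When HAVE_FFDHE_4096 is defined and the base is 2 with a modulus whose top 16 + * bits are all ones (the m[157]/m[156] check below), the specialized + * sp_4096_mod_exp_2_162() path is used; otherwise the generic sp_4096_mod_exp_162() + * is called. The result is written out big-endian with leading zero bytes stripped.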
+ */ +int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, + const mp_int* mod, byte* out, word32* outLen) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[162 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + word32 i; + int err = MP_OKAY; + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 162 * 2; + m = e + 162; + r = b; + + sp_4096_from_mp(b, 162, base); + sp_4096_from_bin(e, 162, exp, expLen); + sp_4096_from_mp(m, 162, mod); + + #ifdef HAVE_FFDHE_4096 + if (base->used == 1 && base->dp[0] == 2U && + ((m[157] << 2) | (m[156] >> 24)) == 0xffffL) { + err = sp_4096_mod_exp_2_162(r, e, expLen * 8U, m); + } + else { + #endif + err = sp_4096_mod_exp_162(r, b, e, expLen * 8U, m, 0); + #ifdef HAVE_FFDHE_4096 + } + #endif + } + + if (err == MP_OKAY) { + sp_4096_to_bin_162(r, out); + *outLen = 512; + for (i=0; i<512U && out[i] == 0U; i++) { + /* Search for first non-zero. */ + } + *outLen -= i; + XMEMMOVE(out, out + i, *outLen); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 162U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +} +#endif /* WOLFSSL_HAVE_SP_DH */ + +#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ + +#endif /* WOLFSSL_SP_SMALL */ +#endif /* WOLFSSL_SP_4096 */ + +#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#ifdef WOLFSSL_HAVE_SP_ECC +#ifndef WOLFSSL_SP_NO_256 + +/* Point structure to use. */ +typedef struct sp_point_256 { + /* X ordinate of point. */ + sp_digit x[2 * 9]; + /* Y ordinate of point. */ + sp_digit y[2 * 9]; + /* Z ordinate of point. */ + sp_digit z[2 * 9]; + /* Indicates point is at infinity. */ + int infinity; +} sp_point_256; + +/* The modulus (prime) of the curve P256. */ +static const sp_digit p256_mod[9] = { + 0x1fffffff,0x1fffffff,0x1fffffff,0x000001ff,0x00000000,0x00000000, + 0x00040000,0x1fe00000,0x00ffffff +}; +/* The Montogmery normalizer for modulus of the curve P256. */ +static const sp_digit p256_norm_mod[9] = { + 0x00000001,0x00000000,0x00000000,0x1ffffe00,0x1fffffff,0x1fffffff, + 0x1ffbffff,0x001fffff,0x00000000 +}; +/* The Montogmery multiplier for modulus of the curve P256. */ +static const sp_digit p256_mp_mod = 0x0000001; +#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \ + defined(HAVE_ECC_VERIFY) +/* The order of the curve P256. */ +static const sp_digit p256_order[9] = { + 0x1c632551,0x1dce5617,0x05e7a13c,0x0df55b4e,0x1ffffbce,0x1fffffff, + 0x0003ffff,0x1fe00000,0x00ffffff +}; +#endif +/* The order of the curve P256 minus 2. */ +static const sp_digit p256_order2[9] = { + 0x1c63254f,0x1dce5617,0x05e7a13c,0x0df55b4e,0x1ffffbce,0x1fffffff, + 0x0003ffff,0x1fe00000,0x00ffffff +}; +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +/* The Montogmery normalizer for order of the curve P256. 
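+ * That is 2^256 minus the order: the value 1 in Montgomery form with R = 2^256.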
*/ +static const sp_digit p256_norm_order[9] = { + 0x039cdaaf,0x0231a9e8,0x1a185ec3,0x120aa4b1,0x00000431,0x00000000, + 0x1ffc0000,0x001fffff,0x00000000 +}; +#endif +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +/* The Montogmery multiplier for order of the curve P256. */ +static const sp_digit p256_mp_order = 0xe00bc4f; +#endif +/* The base point of curve P256. */ +static const sp_point_256 p256_base = { + /* X ordinate */ + { + 0x1898c296,0x0509ca2e,0x1acce83d,0x06fb025b,0x040f2770,0x1372b1d2, + 0x091fe2f3,0x1e5c2588,0x006b17d1, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 + }, + /* Y ordinate */ + { + 0x17bf51f5,0x1db20341,0x0c57b3b2,0x1c66aed6,0x19e162bc,0x15a53e07, + 0x1e6e3b9f,0x1c5fc34f,0x004fe342, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 + }, + /* Z ordinate */ + { + 0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, + 0x00000000,0x00000000,0x00000000, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, + (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0 + }, + /* infinity */ + 0 +}; +#if defined(HAVE_ECC_CHECK_KEY) || defined(HAVE_COMP_KEY) +static const sp_digit p256_b[9] = { + 0x07d2604b,0x1e71e1f1,0x14ec3d8e,0x1a0d6198,0x086bc651,0x1eaabb4c, + 0x0f9ecfae,0x1b154752,0x005ac635 +}; +#endif + +#ifdef WOLFSSL_SP_SMALL +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_256_mul_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 lo; + + c = ((sp_uint64)a[8]) * b[8]; + r[17] = (sp_digit)(c >> 29); + c &= 0x1fffffff; + for (k = 15; k >= 0; k--) { + if (k >= 9) { + i = k - 8; + imax = 8; + } + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 29; + r[k + 2] += (sp_digit)(c >> 29); + r[k + 1] = (sp_digit)(c & 0x1fffffff); + c = lo & 0x1fffffff; + } + r[0] = (sp_digit)c; +} + +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
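+ * + * With 29-bit digits every partial product is below 2^58, so each column sum + * t0..t16 below fits in a signed 64-bit accumulator before carries are propagated.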
+ */ +SP_NOINLINE static void sp_256_mul_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_int64 t0 = ((sp_int64)a[ 0]) * b[ 0]; + sp_int64 t1 = ((sp_int64)a[ 0]) * b[ 1] + + ((sp_int64)a[ 1]) * b[ 0]; + sp_int64 t2 = ((sp_int64)a[ 0]) * b[ 2] + + ((sp_int64)a[ 1]) * b[ 1] + + ((sp_int64)a[ 2]) * b[ 0]; + sp_int64 t3 = ((sp_int64)a[ 0]) * b[ 3] + + ((sp_int64)a[ 1]) * b[ 2] + + ((sp_int64)a[ 2]) * b[ 1] + + ((sp_int64)a[ 3]) * b[ 0]; + sp_int64 t4 = ((sp_int64)a[ 0]) * b[ 4] + + ((sp_int64)a[ 1]) * b[ 3] + + ((sp_int64)a[ 2]) * b[ 2] + + ((sp_int64)a[ 3]) * b[ 1] + + ((sp_int64)a[ 4]) * b[ 0]; + sp_int64 t5 = ((sp_int64)a[ 0]) * b[ 5] + + ((sp_int64)a[ 1]) * b[ 4] + + ((sp_int64)a[ 2]) * b[ 3] + + ((sp_int64)a[ 3]) * b[ 2] + + ((sp_int64)a[ 4]) * b[ 1] + + ((sp_int64)a[ 5]) * b[ 0]; + sp_int64 t6 = ((sp_int64)a[ 0]) * b[ 6] + + ((sp_int64)a[ 1]) * b[ 5] + + ((sp_int64)a[ 2]) * b[ 4] + + ((sp_int64)a[ 3]) * b[ 3] + + ((sp_int64)a[ 4]) * b[ 2] + + ((sp_int64)a[ 5]) * b[ 1] + + ((sp_int64)a[ 6]) * b[ 0]; + sp_int64 t7 = ((sp_int64)a[ 0]) * b[ 7] + + ((sp_int64)a[ 1]) * b[ 6] + + ((sp_int64)a[ 2]) * b[ 5] + + ((sp_int64)a[ 3]) * b[ 4] + + ((sp_int64)a[ 4]) * b[ 3] + + ((sp_int64)a[ 5]) * b[ 2] + + ((sp_int64)a[ 6]) * b[ 1] + + ((sp_int64)a[ 7]) * b[ 0]; + sp_int64 t8 = ((sp_int64)a[ 0]) * b[ 8] + + ((sp_int64)a[ 1]) * b[ 7] + + ((sp_int64)a[ 2]) * b[ 6] + + ((sp_int64)a[ 3]) * b[ 5] + + ((sp_int64)a[ 4]) * b[ 4] + + ((sp_int64)a[ 5]) * b[ 3] + + ((sp_int64)a[ 6]) * b[ 2] + + ((sp_int64)a[ 7]) * b[ 1] + + ((sp_int64)a[ 8]) * b[ 0]; + sp_int64 t9 = ((sp_int64)a[ 1]) * b[ 8] + + ((sp_int64)a[ 2]) * b[ 7] + + ((sp_int64)a[ 3]) * b[ 6] + + ((sp_int64)a[ 4]) * b[ 5] + + ((sp_int64)a[ 5]) * b[ 4] + + ((sp_int64)a[ 6]) * b[ 3] + + ((sp_int64)a[ 7]) * b[ 2] + + ((sp_int64)a[ 8]) * b[ 1]; + sp_int64 t10 = ((sp_int64)a[ 2]) * b[ 8] + + ((sp_int64)a[ 3]) * b[ 7] + + ((sp_int64)a[ 4]) * b[ 6] + + ((sp_int64)a[ 5]) * b[ 5] + + ((sp_int64)a[ 6]) * b[ 4] + + ((sp_int64)a[ 7]) * b[ 3] + + ((sp_int64)a[ 8]) * b[ 2]; + sp_int64 t11 = ((sp_int64)a[ 3]) * b[ 8] + + ((sp_int64)a[ 4]) * b[ 7] + + ((sp_int64)a[ 5]) * b[ 6] + + ((sp_int64)a[ 6]) * b[ 5] + + ((sp_int64)a[ 7]) * b[ 4] + + ((sp_int64)a[ 8]) * b[ 3]; + sp_int64 t12 = ((sp_int64)a[ 4]) * b[ 8] + + ((sp_int64)a[ 5]) * b[ 7] + + ((sp_int64)a[ 6]) * b[ 6] + + ((sp_int64)a[ 7]) * b[ 5] + + ((sp_int64)a[ 8]) * b[ 4]; + sp_int64 t13 = ((sp_int64)a[ 5]) * b[ 8] + + ((sp_int64)a[ 6]) * b[ 7] + + ((sp_int64)a[ 7]) * b[ 6] + + ((sp_int64)a[ 8]) * b[ 5]; + sp_int64 t14 = ((sp_int64)a[ 6]) * b[ 8] + + ((sp_int64)a[ 7]) * b[ 7] + + ((sp_int64)a[ 8]) * b[ 6]; + sp_int64 t15 = ((sp_int64)a[ 7]) * b[ 8] + + ((sp_int64)a[ 8]) * b[ 7]; + sp_int64 t16 = ((sp_int64)a[ 8]) * b[ 8]; + + t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; + t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; + t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; + t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; + t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; + t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; + t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; + t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; + t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; + t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; + t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; + t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; + t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; + t14 += t13 >> 29; r[13] = t13 & 0x1fffffff; + t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; + t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; + r[17] = (sp_digit)(t16 >> 29); + r[16] = t16 & 0x1fffffff; +} + +#endif /* 
WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_256_sqr_9(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint64 c; + sp_uint64 t; + + c = ((sp_uint64)a[8]) * a[8]; + r[17] = (sp_digit)(c >> 29); + c = (c & 0x1fffffff) << 29; + for (k = 15; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; + } + if (k < 8) { + imax = k; + } + else { + imax = 8; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 58); + r[k + 1] = (sp_digit)((c >> 29) & 0x1fffffff); + c = (c & 0x1fffffff) << 29; + } + r[0] = (sp_digit)(c >> 29); +} + +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_256_sqr_9(sp_digit* r, const sp_digit* a) +{ + sp_int64 t0 = ((sp_int64)a[ 0]) * a[ 0]; + sp_int64 t1 = (((sp_int64)a[ 0]) * a[ 1]) * 2; + sp_int64 t2 = (((sp_int64)a[ 0]) * a[ 2]) * 2 + + ((sp_int64)a[ 1]) * a[ 1]; + sp_int64 t3 = (((sp_int64)a[ 0]) * a[ 3] + + ((sp_int64)a[ 1]) * a[ 2]) * 2; + sp_int64 t4 = (((sp_int64)a[ 0]) * a[ 4] + + ((sp_int64)a[ 1]) * a[ 3]) * 2 + + ((sp_int64)a[ 2]) * a[ 2]; + sp_int64 t5 = (((sp_int64)a[ 0]) * a[ 5] + + ((sp_int64)a[ 1]) * a[ 4] + + ((sp_int64)a[ 2]) * a[ 3]) * 2; + sp_int64 t6 = (((sp_int64)a[ 0]) * a[ 6] + + ((sp_int64)a[ 1]) * a[ 5] + + ((sp_int64)a[ 2]) * a[ 4]) * 2 + + ((sp_int64)a[ 3]) * a[ 3]; + sp_int64 t7 = (((sp_int64)a[ 0]) * a[ 7] + + ((sp_int64)a[ 1]) * a[ 6] + + ((sp_int64)a[ 2]) * a[ 5] + + ((sp_int64)a[ 3]) * a[ 4]) * 2; + sp_int64 t8 = (((sp_int64)a[ 0]) * a[ 8] + + ((sp_int64)a[ 1]) * a[ 7] + + ((sp_int64)a[ 2]) * a[ 6] + + ((sp_int64)a[ 3]) * a[ 5]) * 2 + + ((sp_int64)a[ 4]) * a[ 4]; + sp_int64 t9 = (((sp_int64)a[ 1]) * a[ 8] + + ((sp_int64)a[ 2]) * a[ 7] + + ((sp_int64)a[ 3]) * a[ 6] + + ((sp_int64)a[ 4]) * a[ 5]) * 2; + sp_int64 t10 = (((sp_int64)a[ 2]) * a[ 8] + + ((sp_int64)a[ 3]) * a[ 7] + + ((sp_int64)a[ 4]) * a[ 6]) * 2 + + ((sp_int64)a[ 5]) * a[ 5]; + sp_int64 t11 = (((sp_int64)a[ 3]) * a[ 8] + + ((sp_int64)a[ 4]) * a[ 7] + + ((sp_int64)a[ 5]) * a[ 6]) * 2; + sp_int64 t12 = (((sp_int64)a[ 4]) * a[ 8] + + ((sp_int64)a[ 5]) * a[ 7]) * 2 + + ((sp_int64)a[ 6]) * a[ 6]; + sp_int64 t13 = (((sp_int64)a[ 5]) * a[ 8] + + ((sp_int64)a[ 6]) * a[ 7]) * 2; + sp_int64 t14 = (((sp_int64)a[ 6]) * a[ 8]) * 2 + + ((sp_int64)a[ 7]) * a[ 7]; + sp_int64 t15 = (((sp_int64)a[ 7]) * a[ 8]) * 2; + sp_int64 t16 = ((sp_int64)a[ 8]) * a[ 8]; + + t1 += t0 >> 29; r[ 0] = t0 & 0x1fffffff; + t2 += t1 >> 29; r[ 1] = t1 & 0x1fffffff; + t3 += t2 >> 29; r[ 2] = t2 & 0x1fffffff; + t4 += t3 >> 29; r[ 3] = t3 & 0x1fffffff; + t5 += t4 >> 29; r[ 4] = t4 & 0x1fffffff; + t6 += t5 >> 29; r[ 5] = t5 & 0x1fffffff; + t7 += t6 >> 29; r[ 6] = t6 & 0x1fffffff; + t8 += t7 >> 29; r[ 7] = t7 & 0x1fffffff; + t9 += t8 >> 29; r[ 8] = t8 & 0x1fffffff; + t10 += t9 >> 29; r[ 9] = t9 & 0x1fffffff; + t11 += t10 >> 29; r[10] = t10 & 0x1fffffff; + t12 += t11 >> 29; r[11] = t11 & 0x1fffffff; + t13 += t12 >> 29; r[12] = t12 & 0x1fffffff; + t14 += t13 >> 29; r[13] = t13 & 0x1fffffff; + t15 += t14 >> 29; r[14] = t14 & 0x1fffffff; + t16 += t15 >> 29; r[15] = t15 & 0x1fffffff; + r[17] = (sp_digit)(t16 >> 29); + r[16] = t16 & 0x1fffffff; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Add b to a into r. 
(r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_256_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 9; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} +#else +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_256_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + r[ 0] = a[ 0] + b[ 0]; + r[ 1] = a[ 1] + b[ 1]; + r[ 2] = a[ 2] + b[ 2]; + r[ 3] = a[ 3] + b[ 3]; + r[ 4] = a[ 4] + b[ 4]; + r[ 5] = a[ 5] + b[ 5]; + r[ 6] = a[ 6] + b[ 6]; + r[ 7] = a[ 7] + b[ 7]; + r[ 8] = a[ 8] + b[ 8]; + + return 0; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_256_sub_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 9; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_256_sub_9(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + r[ 0] = a[ 0] - b[ 0]; + r[ 1] = a[ 1] - b[ 1]; + r[ 2] = a[ 2] - b[ 2]; + r[ 3] = a[ 3] - b[ 3]; + r[ 4] = a[ 4] - b[ 4]; + r[ 5] = a[ 5] - b[ 5]; + r[ 6] = a[ 6] - b[ 6]; + r[ 7] = a[ 7] - b[ 7]; + r[ 8] = a[ 8] - b[ 8]; + + return 0; +} + +#endif /* WOLFSSL_SP_SMALL */ +/* Convert an mp_int to an array of sp_digit. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a A multi-precision integer. + */ +static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) +{ +#if DIGIT_BIT == 29 + int j; + + XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); + + for (j = a->used; j < size; j++) { + r[j] = 0; + } +#elif DIGIT_BIT > 29 + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i] << s); + r[j] &= 0x1fffffff; + s = 29U - s; + if (j + 1 >= size) { + break; + } + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + while ((s + 29U) <= (word32)DIGIT_BIT) { + s += 29U; + r[j] &= 0x1fffffff; + if (j + 1 >= size) { + break; + } + if (s < (word32)DIGIT_BIT) { + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + } + else { + r[++j] = (sp_digit)0; + } + } + s = (word32)DIGIT_BIT - s; + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#else + int i; + int j = 0; + int s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i]) << s; + if (s + DIGIT_BIT >= 29) { + r[j] &= 0x1fffffff; + if (j + 1 >= size) { + break; + } + s = 29 - s; + if (s == DIGIT_BIT) { + r[++j] = 0; + s = 0; + } + else { + r[++j] = a->dp[i] >> s; + s = DIGIT_BIT - s; + } + } + else { + s += DIGIT_BIT; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#endif +} + +/* Convert a point of type ecc_point to type sp_point_256. + * + * p Point of type sp_point_256 (result). + * pm Point of type ecc_point. 
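+ * + * Each ordinate is rebased from the mp_int's DIGIT_BIT-wide digits into nine + * 29-bit words (sp_256_from_mp) and the infinity flag is cleared.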
+ */ +static void sp_256_point_from_ecc_point_9(sp_point_256* p, + const ecc_point* pm) +{ + XMEMSET(p->x, 0, sizeof(p->x)); + XMEMSET(p->y, 0, sizeof(p->y)); + XMEMSET(p->z, 0, sizeof(p->z)); + sp_256_from_mp(p->x, 9, pm->x); + sp_256_from_mp(p->y, 9, pm->y); + sp_256_from_mp(p->z, 9, pm->z); + p->infinity = 0; +} + +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_256_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (256 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 29 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 9); + r->used = 9; + mp_clamp(r); +#elif DIGIT_BIT < 29 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 9; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 29) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 29 - s; + } + r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 9; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 29 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 29 - s; + } + else { + s += 29; + } + } + r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Convert a point of type sp_point_256 to type ecc_point. + * + * p Point of type sp_point_256. + * pm Point of type ecc_point (result). + * returns MEMORY_E when allocation of memory in ecc_point fails otherwise + * MP_OKAY. + */ +static int sp_256_point_to_ecc_point_9(const sp_point_256* p, ecc_point* pm) +{ + int err; + + err = sp_256_to_mp(p->x, pm->x); + if (err == MP_OKAY) { + err = sp_256_to_mp(p->y, pm->y); + } + if (err == MP_OKAY) { + err = sp_256_to_mp(p->z, pm->z); + } + + return err; +} + +#define sp_256_mont_reduce_order_9 sp_256_mont_reduce_9 + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_256_cmp_9(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=8; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } +#else + r |= (a[ 8] - b[ 8]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 7] - b[ 7]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 6] - b[ 6]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 5] - b[ 5]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 4] - b[ 4]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 3] - b[ 3]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 2] - b[ 2]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 1] - b[ 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + r |= (a[ 0] - b[ 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); +#endif /* WOLFSSL_SP_SMALL */ + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. 
+ * m Mask value to apply. + */ +static void sp_256_cond_sub_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i = 0; i < 9; i++) { + r[i] = a[i] - (b[i] & m); + } +#else + r[ 0] = a[ 0] - (b[ 0] & m); + r[ 1] = a[ 1] - (b[ 1] & m); + r[ 2] = a[ 2] - (b[ 2] & m); + r[ 3] = a[ 3] - (b[ 3] & m); + r[ 4] = a[ 4] - (b[ 4] & m); + r[ 5] = a[ 5] - (b[ 5] & m); + r[ 6] = a[ 6] - (b[ 6] & m); + r[ 7] = a[ 7] - (b[ 7] & m); + r[ 8] = a[ 8] - (b[ 8] & m); +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_256_mul_add_9(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ +#ifndef WOLFSSL_SP_LARGE_CODE + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 9; i++) { + t += r[i]; + t += tb * a[i]; + r[i] = ((sp_digit)t) & 0x1fffffff; + t >>= 29; + } + r[9] += (sp_digit)t; +#else +#ifdef WOLFSSL_SP_SMALL + sp_int64 tb = b; + sp_int64 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 8; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[0] = t[3] >> 29; + } + t[0] += (tb * a[8]) + r[8]; + r[8] = t[0] & 0x1fffffff; + r[9] += (sp_digit)(t[0] >> 29); +#else + sp_int64 tb = b; + sp_int64 t[8]; + int i; + + t[0] = 0; + for (i = 0; i < 8; i += 8) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + t[4] = (tb * a[i+4]) + r[i+4]; + t[5] = (tb * a[i+5]) + r[i+5]; + t[6] = (tb * a[i+6]) + r[i+6]; + t[7] = (tb * a[i+7]) + r[i+7]; + r[i+0] = t[0] & 0x1fffffff; + t[1] += t[0] >> 29; + r[i+1] = t[1] & 0x1fffffff; + t[2] += t[1] >> 29; + r[i+2] = t[2] & 0x1fffffff; + t[3] += t[2] >> 29; + r[i+3] = t[3] & 0x1fffffff; + t[4] += t[3] >> 29; + r[i+4] = t[4] & 0x1fffffff; + t[5] += t[4] >> 29; + r[i+5] = t[5] & 0x1fffffff; + t[6] += t[5] >> 29; + r[i+6] = t[6] & 0x1fffffff; + t[7] += t[6] >> 29; + r[i+7] = t[7] & 0x1fffffff; + t[0] = t[7] >> 29; + } + t[0] += (tb * a[8]) + r[8]; + r[8] = t[0] & 0x1fffffff; + r[9] += (sp_digit)(t[0] >> 29); +#endif /* WOLFSSL_SP_SMALL */ +#endif /* !WOLFSSL_SP_LARGE_CODE */ +} + +/* Normalize the values in each word to 29 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_256_norm_9(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 8; i++) { + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } +#else + a[1] += a[0] >> 29; a[0] &= 0x1fffffff; + a[2] += a[1] >> 29; a[1] &= 0x1fffffff; + a[3] += a[2] >> 29; a[2] &= 0x1fffffff; + a[4] += a[3] >> 29; a[3] &= 0x1fffffff; + a[5] += a[4] >> 29; a[4] &= 0x1fffffff; + a[6] += a[5] >> 29; a[5] &= 0x1fffffff; + a[7] += a[6] >> 29; a[6] &= 0x1fffffff; + a[8] += a[7] >> 29; a[7] &= 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Shift the result in the high 256 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
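+ * + * The Montgomery radix is 2^256 = 8 full 29-bit words plus 24 bits, which is why + * the shift starts at a[8] >> 24 and each higher word is brought in with << 5.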
+ */ +static void sp_256_mont_shift_9(sp_digit* r, const sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + sp_int64 n = a[8] >> 24; + n += ((sp_int64)a[9]) << 5; + + for (i = 0; i < 8; i++) { + r[i] = n & 0x1fffffff; + n >>= 29; + n += ((sp_int64)a[10 + i]) << 5; + } + r[8] = (sp_digit)n; +#else + sp_int64 n = a[8] >> 24; + n += ((sp_int64)a[9]) << 5; + r[ 0] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[10]) << 5; + r[ 1] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[11]) << 5; + r[ 2] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[12]) << 5; + r[ 3] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[13]) << 5; + r[ 4] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[14]) << 5; + r[ 5] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[15]) << 5; + r[ 6] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[16]) << 5; + r[ 7] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[17]) << 5; + r[8] = (sp_digit)n; +#endif /* WOLFSSL_SP_SMALL */ + XMEMSET(&r[9], 0, sizeof(*r) * 9U); +} + +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_256_mont_reduce_9(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + if (mp != 1) { + for (i=0; i<8; i++) { + mu = (a[i] * mp) & 0x1fffffff; + sp_256_mul_add_9(a+i, m, mu); + a[i+1] += a[i] >> 29; + } + mu = (a[i] * mp) & 0xffffffL; + sp_256_mul_add_9(a+i, m, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } + else { + for (i=0; i<8; i++) { + mu = a[i] & 0x1fffffff; + sp_256_mul_add_9(a+i, p256_mod, mu); + a[i+1] += a[i] >> 29; + } + mu = a[i] & 0xffffffL; + sp_256_mul_add_9(a+i, p256_mod, mu); + a[i+1] += a[i] >> 29; + a[i] &= 0x1fffffff; + } + + sp_256_mont_shift_9(a, a); + sp_256_cond_sub_9(a, a, m, 0 - (((a[8] >> 24) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_256_mont_mul_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_256_mul_9(r, a, b); + sp_256_mont_reduce_9(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_256_mont_sqr_9(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_256_sqr_9(r, a); + sp_256_mont_reduce_9(r, m, mp); +} + +#if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY) +/* Square the Montgomery form number a number of times. (r = a ^ n mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * n Number of times to square. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_256_mont_sqr_n_9(sp_digit* r, const sp_digit* a, int n, + const sp_digit* m, sp_digit mp) +{ + sp_256_mont_sqr_9(r, a, m, mp); + for (; n > 1; n--) { + sp_256_mont_sqr_9(r, r, m, mp); + } +} + +#endif /* !WOLFSSL_SP_SMALL | HAVE_COMP_KEY */ +#ifdef WOLFSSL_SP_SMALL +/* Mod-2 for the P256 curve. 
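+ * Used as the exponent for Fermat inversion: a^(p-2) = a^-1 (mod p) for prime p.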
*/ +static const uint32_t p256_mod_minus_2[8] = { + 0xfffffffdU,0xffffffffU,0xffffffffU,0x00000000U,0x00000000U,0x00000000U, + 0x00000001U,0xffffffffU +}; +#endif /* !WOLFSSL_SP_SMALL */ + +/* Invert the number, in Montgomery form, modulo the modulus (prime) of the + * P256 curve. (r = 1 / a mod m) + * + * r Inverse result. + * a Number to invert. + * td Temporary data. + */ +static void sp_256_mont_inv_9(sp_digit* r, const sp_digit* a, sp_digit* td) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit* t = td; + int i; + + XMEMCPY(t, a, sizeof(sp_digit) * 9); + for (i=254; i>=0; i--) { + sp_256_mont_sqr_9(t, t, p256_mod, p256_mp_mod); + if (p256_mod_minus_2[i / 32] & ((sp_digit)1 << (i % 32))) + sp_256_mont_mul_9(t, t, a, p256_mod, p256_mp_mod); + } + XMEMCPY(r, t, sizeof(sp_digit) * 9); +#else + sp_digit* t1 = td; + sp_digit* t2 = td + 2 * 9; + sp_digit* t3 = td + 4 * 9; + /* 0x2 */ + sp_256_mont_sqr_9(t1, a, p256_mod, p256_mp_mod); + /* 0x3 */ + sp_256_mont_mul_9(t2, t1, a, p256_mod, p256_mp_mod); + /* 0xc */ + sp_256_mont_sqr_n_9(t1, t2, 2, p256_mod, p256_mp_mod); + /* 0xd */ + sp_256_mont_mul_9(t3, t1, a, p256_mod, p256_mp_mod); + /* 0xf */ + sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod); + /* 0xf0 */ + sp_256_mont_sqr_n_9(t1, t2, 4, p256_mod, p256_mp_mod); + /* 0xfd */ + sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod); + /* 0xff */ + sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod); + /* 0xff00 */ + sp_256_mont_sqr_n_9(t1, t2, 8, p256_mod, p256_mp_mod); + /* 0xfffd */ + sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod); + /* 0xffff */ + sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod); + /* 0xffff0000 */ + sp_256_mont_sqr_n_9(t1, t2, 16, p256_mod, p256_mp_mod); + /* 0xfffffffd */ + sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod); + /* 0xffffffff */ + sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod); + /* 0xffffffff00000000 */ + sp_256_mont_sqr_n_9(t1, t2, 32, p256_mod, p256_mp_mod); + /* 0xffffffffffffffff */ + sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod); + /* 0xffffffff00000001 */ + sp_256_mont_mul_9(r, t1, a, p256_mod, p256_mp_mod); + /* 0xffffffff000000010000000000000000000000000000000000000000 */ + sp_256_mont_sqr_n_9(r, r, 160, p256_mod, p256_mp_mod); + /* 0xffffffff00000001000000000000000000000000ffffffffffffffff */ + sp_256_mont_mul_9(r, r, t2, p256_mod, p256_mp_mod); + /* 0xffffffff00000001000000000000000000000000ffffffffffffffff00000000 */ + sp_256_mont_sqr_n_9(r, r, 32, p256_mod, p256_mp_mod); + /* 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd */ + sp_256_mont_mul_9(r, r, t3, p256_mod, p256_mp_mod); +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Map the Montgomery form projective coordinate point to an affine point. + * + * r Resulting affine coordinate point. + * p Montgomery form projective coordinate point. + * t Temporary ordinate data. + */ +static void sp_256_map_9(sp_point_256* r, const sp_point_256* p, + sp_digit* t) +{ + sp_digit* t1 = t; + sp_digit* t2 = t + 2*9; + sp_int32 n; + + sp_256_mont_inv_9(t1, p->z, t + 2*9); + + sp_256_mont_sqr_9(t2, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t2, t1, p256_mod, p256_mp_mod); + + /* x /= z^2 */ + sp_256_mont_mul_9(r->x, p->x, t2, p256_mod, p256_mp_mod); + XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U); + sp_256_mont_reduce_9(r->x, p256_mod, p256_mp_mod); + /* Reduce x to less than modulus */ + n = sp_256_cmp_9(r->x, p256_mod); + sp_256_cond_sub_9(r->x, r->x, p256_mod, 0 - ((n >= 0) ? 
+ (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r->x); + + /* y /= z^3 */ + sp_256_mont_mul_9(r->y, p->y, t1, p256_mod, p256_mp_mod); + XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U); + sp_256_mont_reduce_9(r->y, p256_mod, p256_mp_mod); + /* Reduce y to less than modulus */ + n = sp_256_cmp_9(r->y, p256_mod); + sp_256_cond_sub_9(r->y, r->y, p256_mod, 0 - ((n >= 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r->y); + + XMEMSET(r->z, 0, sizeof(r->z)); + r->z[0] = 1; + +} + +/* Add two Montgomery form numbers (r = a + b % m). + * + * r Result of addition. + * a First number to add in Montgomery form. + * b Second number to add in Montgomery form. + * m Modulus (prime). + */ +static void sp_256_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, + const sp_digit* m) +{ + (void)sp_256_add_9(r, a, b); + sp_256_norm_9(r); + sp_256_cond_sub_9(r, r, m, 0 - (((r[8] >> 24) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r); +} + +/* Double a Montgomery form number (r = a + a % m). + * + * r Result of doubling. + * a Number to double in Montgomery form. + * m Modulus (prime). + */ +static void sp_256_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + (void)sp_256_add_9(r, a, a); + sp_256_norm_9(r); + sp_256_cond_sub_9(r, r, m, 0 - (((r[8] >> 24) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r); +} + +/* Triple a Montgomery form number (r = a + a + a % m). + * + * r Result of tripling. + * a Number to triple in Montgomery form. + * m Modulus (prime). + */ +static void sp_256_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + (void)sp_256_add_9(r, a, a); + sp_256_norm_9(r); + sp_256_cond_sub_9(r, r, m, 0 - (((r[8] >> 24) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r); + (void)sp_256_add_9(r, r, a); + sp_256_norm_9(r); + sp_256_cond_sub_9(r, r, m, 0 - (((r[8] >> 24) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_256_norm_9(r); +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i = 0; i < 9; i++) { + r[i] = a[i] + (b[i] & m); + } +#else + r[ 0] = a[ 0] + (b[ 0] & m); + r[ 1] = a[ 1] + (b[ 1] & m); + r[ 2] = a[ 2] + (b[ 2] & m); + r[ 3] = a[ 3] + (b[ 3] & m); + r[ 4] = a[ 4] + (b[ 4] & m); + r[ 5] = a[ 5] + (b[ 5] & m); + r[ 6] = a[ 6] + (b[ 6] & m); + r[ 7] = a[ 7] + (b[ 7] & m); + r[ 8] = a[ 8] + (b[ 8] & m); +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Subtract two Montgomery form numbers (r = a - b % m). + * + * r Result of subtraction. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). + */ +static void sp_256_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, + const sp_digit* m) +{ + (void)sp_256_sub_9(r, a, b); + sp_256_norm_9(r); + sp_256_cond_add_9(r, r, m, r[8] >> 24); + sp_256_norm_9(r); +} + +/* Shift number right one bit. + * Bottom bit is lost. + * + * r Result of shift. + * a Number to shift. 
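+ * Bit 0 of each higher word becomes bit 28 of the word below it, keeping every + * word within the 29-bit representation.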
+ */ +SP_NOINLINE static void sp_256_rshift1_9(sp_digit* r, const sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i=0; i<8; i++) { + r[i] = (a[i] >> 1) + ((a[i + 1] << 28) & 0x1fffffff); + } +#else + r[0] = (a[0] >> 1) + ((a[1] << 28) & 0x1fffffff); + r[1] = (a[1] >> 1) + ((a[2] << 28) & 0x1fffffff); + r[2] = (a[2] >> 1) + ((a[3] << 28) & 0x1fffffff); + r[3] = (a[3] >> 1) + ((a[4] << 28) & 0x1fffffff); + r[4] = (a[4] >> 1) + ((a[5] << 28) & 0x1fffffff); + r[5] = (a[5] >> 1) + ((a[6] << 28) & 0x1fffffff); + r[6] = (a[6] >> 1) + ((a[7] << 28) & 0x1fffffff); + r[7] = (a[7] >> 1) + ((a[8] << 28) & 0x1fffffff); +#endif + r[8] = a[8] >> 1; +} + +/* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) + * + * r Result of division by 2. + * a Number to divide. + * m Modulus (prime). + */ +static void sp_256_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + sp_256_cond_add_9(r, a, m, 0 - (a[0] & 1)); + sp_256_norm_9(r); + sp_256_rshift1_9(r, r); +} + +/* Double the Montgomery form projective point p. + * + * r Result of doubling point. + * p Point to double. + * t Temporary ordinate data. + */ +#ifdef WOLFSSL_SP_NONBLOCK +typedef struct sp_256_proj_point_dbl_9_ctx { + int state; + sp_digit* t1; + sp_digit* t2; + sp_digit* x; + sp_digit* y; + sp_digit* z; +} sp_256_proj_point_dbl_9_ctx; + +static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t) +{ + int err = FP_WOULDBLOCK; + sp_256_proj_point_dbl_9_ctx* ctx = (sp_256_proj_point_dbl_9_ctx*)sp_ctx->data; + + typedef char ctx_size_test[sizeof(sp_256_proj_point_dbl_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1]; + (void)sizeof(ctx_size_test); + + switch (ctx->state) { + case 0: + ctx->t1 = t; + ctx->t2 = t + 2*9; + ctx->x = r->x; + ctx->y = r->y; + ctx->z = r->z; + + /* Put infinity into result. 
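+ * States 1 to 18 below perform, one field operation per call, the same doubling + * sequence as sp_256_proj_point_dbl_9() further down; state 19 reports MP_OKAY.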
*/ + if (r != p) { + r->infinity = p->infinity; + } + ctx->state = 1; + break; + case 1: + /* T1 = Z * Z */ + sp_256_mont_sqr_9(ctx->t1, p->z, p256_mod, p256_mp_mod); + ctx->state = 2; + break; + case 2: + /* Z = Y * Z */ + sp_256_mont_mul_9(ctx->z, p->y, p->z, p256_mod, p256_mp_mod); + ctx->state = 3; + break; + case 3: + /* Z = 2Z */ + sp_256_mont_dbl_9(ctx->z, ctx->z, p256_mod); + ctx->state = 4; + break; + case 4: + /* T2 = X - T1 */ + sp_256_mont_sub_9(ctx->t2, p->x, ctx->t1, p256_mod); + ctx->state = 5; + break; + case 5: + /* T1 = X + T1 */ + sp_256_mont_add_9(ctx->t1, p->x, ctx->t1, p256_mod); + ctx->state = 6; + break; + case 6: + /* T2 = T1 * T2 */ + sp_256_mont_mul_9(ctx->t2, ctx->t1, ctx->t2, p256_mod, p256_mp_mod); + ctx->state = 7; + break; + case 7: + /* T1 = 3T2 */ + sp_256_mont_tpl_9(ctx->t1, ctx->t2, p256_mod); + ctx->state = 8; + break; + case 8: + /* Y = 2Y */ + sp_256_mont_dbl_9(ctx->y, p->y, p256_mod); + ctx->state = 9; + break; + case 9: + /* Y = Y * Y */ + sp_256_mont_sqr_9(ctx->y, ctx->y, p256_mod, p256_mp_mod); + ctx->state = 10; + break; + case 10: + /* T2 = Y * Y */ + sp_256_mont_sqr_9(ctx->t2, ctx->y, p256_mod, p256_mp_mod); + ctx->state = 11; + break; + case 11: + /* T2 = T2/2 */ + sp_256_div2_9(ctx->t2, ctx->t2, p256_mod); + ctx->state = 12; + break; + case 12: + /* Y = Y * X */ + sp_256_mont_mul_9(ctx->y, ctx->y, p->x, p256_mod, p256_mp_mod); + ctx->state = 13; + break; + case 13: + /* X = T1 * T1 */ + sp_256_mont_sqr_9(ctx->x, ctx->t1, p256_mod, p256_mp_mod); + ctx->state = 14; + break; + case 14: + /* X = X - Y */ + sp_256_mont_sub_9(ctx->x, ctx->x, ctx->y, p256_mod); + ctx->state = 15; + break; + case 15: + /* X = X - Y */ + sp_256_mont_sub_9(ctx->x, ctx->x, ctx->y, p256_mod); + ctx->state = 16; + break; + case 16: + /* Y = Y - X */ + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); + ctx->state = 17; + break; + case 17: + /* Y = Y * T1 */ + sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t1, p256_mod, p256_mp_mod); + ctx->state = 18; + break; + case 18: + /* Y = Y - T2 */ + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t2, p256_mod); + ctx->state = 19; + /* fall-through */ + case 19: + err = MP_OKAY; + break; + } + + if (err == MP_OKAY && ctx->state != 19) { + err = FP_WOULDBLOCK; + } + + return err; +} +#endif /* WOLFSSL_SP_NONBLOCK */ + +static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p, sp_digit* t) +{ + sp_digit* t1 = t; + sp_digit* t2 = t + 2*9; + sp_digit* x; + sp_digit* y; + sp_digit* z; + + x = r->x; + y = r->y; + z = r->z; + /* Put infinity into result. 
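+ * With M = 3*(X - Z^2)*(X + Z^2) and S = 4*X*Y^2, the steps below compute + * X' = M^2 - 2*S, Y' = M*(S - X') - 8*Y^4 and Z' = 2*Y*Z.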
*/ + if (r != p) { + r->infinity = p->infinity; + } + + /* T1 = Z * Z */ + sp_256_mont_sqr_9(t1, p->z, p256_mod, p256_mp_mod); + /* Z = Y * Z */ + sp_256_mont_mul_9(z, p->y, p->z, p256_mod, p256_mp_mod); + /* Z = 2Z */ + sp_256_mont_dbl_9(z, z, p256_mod); + /* T2 = X - T1 */ + sp_256_mont_sub_9(t2, p->x, t1, p256_mod); + /* T1 = X + T1 */ + sp_256_mont_add_9(t1, p->x, t1, p256_mod); + /* T2 = T1 * T2 */ + sp_256_mont_mul_9(t2, t1, t2, p256_mod, p256_mp_mod); + /* T1 = 3T2 */ + sp_256_mont_tpl_9(t1, t2, p256_mod); + /* Y = 2Y */ + sp_256_mont_dbl_9(y, p->y, p256_mod); + /* Y = Y * Y */ + sp_256_mont_sqr_9(y, y, p256_mod, p256_mp_mod); + /* T2 = Y * Y */ + sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod); + /* T2 = T2/2 */ + sp_256_div2_9(t2, t2, p256_mod); + /* Y = Y * X */ + sp_256_mont_mul_9(y, y, p->x, p256_mod, p256_mp_mod); + /* X = T1 * T1 */ + sp_256_mont_sqr_9(x, t1, p256_mod, p256_mp_mod); + /* X = X - Y */ + sp_256_mont_sub_9(x, x, y, p256_mod); + /* X = X - Y */ + sp_256_mont_sub_9(x, x, y, p256_mod); + /* Y = Y - X */ + sp_256_mont_sub_9(y, y, x, p256_mod); + /* Y = Y * T1 */ + sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod); + /* Y = Y - T2 */ + sp_256_mont_sub_9(y, y, t2, p256_mod); +} + +/* Compare two numbers to determine if they are equal. + * Constant time implementation. + * + * a First number to compare. + * b Second number to compare. + * returns 1 when equal and 0 otherwise. + */ +static int sp_256_cmp_equal_9(const sp_digit* a, const sp_digit* b) +{ + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | + (a[3] ^ b[3]) | (a[4] ^ b[4]) | (a[5] ^ b[5]) | + (a[6] ^ b[6]) | (a[7] ^ b[7]) | (a[8] ^ b[8])) == 0; +} + +/* Add two Montgomery form projective points. + * + * r Result of addition. + * p First point to add. + * q Second point to add. + * t Temporary ordinate data. + */ + +#ifdef WOLFSSL_SP_NONBLOCK +typedef struct sp_256_proj_point_add_9_ctx { + int state; + sp_256_proj_point_dbl_9_ctx dbl_ctx; + const sp_point_256* ap[2]; + sp_point_256* rp[2]; + sp_digit* t1; + sp_digit* t2; + sp_digit* t3; + sp_digit* t4; + sp_digit* t5; + sp_digit* x; + sp_digit* y; + sp_digit* z; +} sp_256_proj_point_add_9_ctx; + +static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, + const sp_point_256* p, const sp_point_256* q, sp_digit* t) +{ + int err = FP_WOULDBLOCK; + sp_256_proj_point_add_9_ctx* ctx = (sp_256_proj_point_add_9_ctx*)sp_ctx->data; + + /* Ensure only the first point is the same as the result. */ + if (q == r) { + const sp_point_256* a = p; + p = q; + q = a; + } + + typedef char ctx_size_test[sizeof(sp_256_proj_point_add_9_ctx) >= sizeof(*sp_ctx) ? 
-1 : 1]; + (void)sizeof(ctx_size_test); + + switch (ctx->state) { + case 0: /* INIT */ + ctx->t1 = t; + ctx->t2 = t + 2*9; + ctx->t3 = t + 4*9; + ctx->t4 = t + 6*9; + ctx->t5 = t + 8*9; + + ctx->state = 1; + break; + case 1: + /* Check double */ + (void)sp_256_sub_9(ctx->t1, p256_mod, q->y); + sp_256_norm_9(ctx->t1); + if ((sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) & + (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, ctx->t1))) != 0) + { + XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx)); + ctx->state = 2; + } + else { + ctx->state = 3; + } + break; + case 2: + err = sp_256_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t); + if (err == MP_OKAY) + ctx->state = 27; /* done */ + break; + case 3: + { + int i; + ctx->rp[0] = r; + + /*lint allow cast to different type of pointer*/ + ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ + XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); + ctx->x = ctx->rp[p->infinity | q->infinity]->x; + ctx->y = ctx->rp[p->infinity | q->infinity]->y; + ctx->z = ctx->rp[p->infinity | q->infinity]->z; + + ctx->ap[0] = p; + ctx->ap[1] = q; + for (i=0; i<9; i++) { + r->x[i] = ctx->ap[p->infinity]->x[i]; + } + for (i=0; i<9; i++) { + r->y[i] = ctx->ap[p->infinity]->y[i]; + } + for (i=0; i<9; i++) { + r->z[i] = ctx->ap[p->infinity]->z[i]; + } + r->infinity = ctx->ap[p->infinity]->infinity; + + ctx->state = 4; + break; + } + case 4: + /* U1 = X1*Z2^2 */ + sp_256_mont_sqr_9(ctx->t1, q->z, p256_mod, p256_mp_mod); + ctx->state = 5; + break; + case 5: + sp_256_mont_mul_9(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod); + ctx->state = 6; + break; + case 6: + sp_256_mont_mul_9(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + ctx->state = 7; + break; + case 7: + /* U2 = X2*Z1^2 */ + sp_256_mont_sqr_9(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + ctx->state = 8; + break; + case 8: + sp_256_mont_mul_9(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + ctx->state = 9; + break; + case 9: + sp_256_mont_mul_9(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod); + ctx->state = 10; + break; + case 10: + /* S1 = Y1*Z2^3 */ + sp_256_mont_mul_9(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + ctx->state = 11; + break; + case 11: + /* S2 = Y2*Z1^3 */ + sp_256_mont_mul_9(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod); + ctx->state = 12; + break; + case 12: + /* H = U2 - U1 */ + sp_256_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p256_mod); + ctx->state = 13; + break; + case 13: + /* R = S2 - S1 */ + sp_256_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p256_mod); + ctx->state = 14; + break; + case 14: + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + ctx->state = 15; + break; + case 15: + sp_256_mont_mul_9(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + ctx->state = 16; + break; + case 16: + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + ctx->state = 17; + break; + case 17: + sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + ctx->state = 18; + break; + case 18: + sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + ctx->state = 19; + break; + case 19: + sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + ctx->state = 20; + break; + case 20: + sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t5, p256_mod); + ctx->state = 21; + break; + case 21: + sp_256_mont_dbl_9(ctx->t1, ctx->y, p256_mod); + ctx->state = 22; + break; + case 22: + sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t1, p256_mod); + ctx->state = 23; + break; + case 23: + /* Y3 = R*(U1*H^2 - 
X3) - S1*H^3 */ + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); + ctx->state = 24; + break; + case 24: + sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + ctx->state = 25; + break; + case 25: + sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + ctx->state = 26; + break; + case 26: + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t5, p256_mod); + ctx->state = 27; + /* fall-through */ + case 27: + err = MP_OKAY; + break; + } + + if (err == MP_OKAY && ctx->state != 27) { + err = FP_WOULDBLOCK; + } + return err; +} +#endif /* WOLFSSL_SP_NONBLOCK */ + +static void sp_256_proj_point_add_9(sp_point_256* r, + const sp_point_256* p, const sp_point_256* q, sp_digit* t) +{ + const sp_point_256* ap[2]; + sp_point_256* rp[2]; + sp_digit* t1 = t; + sp_digit* t2 = t + 2*9; + sp_digit* t3 = t + 4*9; + sp_digit* t4 = t + 6*9; + sp_digit* t5 = t + 8*9; + sp_digit* x; + sp_digit* y; + sp_digit* z; + int i; + + /* Ensure only the first point is the same as the result. */ + if (q == r) { + const sp_point_256* a = p; + p = q; + q = a; + } + + /* Check double */ + (void)sp_256_sub_9(t1, p256_mod, q->y); + sp_256_norm_9(t1); + if ((sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) & + (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, t1))) != 0) { + sp_256_proj_point_dbl_9(r, p, t); + } + else { + rp[0] = r; + + /*lint allow cast to different type of pointer*/ + rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ + XMEMSET(rp[1], 0, sizeof(sp_point_256)); + x = rp[p->infinity | q->infinity]->x; + y = rp[p->infinity | q->infinity]->y; + z = rp[p->infinity | q->infinity]->z; + + ap[0] = p; + ap[1] = q; + for (i=0; i<9; i++) { + r->x[i] = ap[p->infinity]->x[i]; + } + for (i=0; i<9; i++) { + r->y[i] = ap[p->infinity]->y[i]; + } + for (i=0; i<9; i++) { + r->z[i] = ap[p->infinity]->z[i]; + } + r->infinity = ap[p->infinity]->infinity; + + /* U1 = X1*Z2^2 */ + sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, x, p256_mod, p256_mp_mod); + /* U2 = X2*Z1^2 */ + sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); + /* S1 = Y1*Z2^3 */ + sp_256_mont_mul_9(t3, t3, y, p256_mod, p256_mp_mod); + /* S2 = Y2*Z1^3 */ + sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); + /* H = U2 - U1 */ + sp_256_mont_sub_9(t2, t2, t1, p256_mod); + /* R = S2 - S1 */ + sp_256_mont_sub_9(t4, t4, t3, p256_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(x, x, t5, p256_mod); + sp_256_mont_dbl_9(t1, y, p256_mod); + sp_256_mont_sub_9(x, x, t1, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_9(y, y, x, p256_mod); + sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(y, y, t5, p256_mod); + } +} + +/* Multiply a number by Montogmery normalizer mod modulus (prime). + * + * r The resulting Montgomery form number. + * a The number to convert. + * m The modulus (prime). + * returns MEMORY_E when memory allocation fails and MP_OKAY otherwise. 
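+ *
+ * The hard-coded signed coefficient rows below evaluate a * 2^256 mod p256
+ * over eight 32-bit words, using p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1,
+ * and the result is then repacked into nine 29-bit digits.  As a rough
+ * sketch only (a, p256 and r standing for initialized mp_int copies of the
+ * operands, not the sp_digit arrays used here), the generic equivalent is:
+ *
+ *     mp_int t;
+ *     mp_init(&t);
+ *     mp_mul_2d(a, 256, &t);   // t = a * 2^256
+ *     mp_mod(&t, p256, r);     // r = t mod p256, i.e. a in Montgomery form
+ *     mp_clear(&t);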
+ */ +static int sp_256_mod_mul_norm_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + int64_t* t = NULL; +#else + int64_t t[2 * 8]; +#endif + int64_t* a32 = NULL; + int64_t o; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); + if (t == NULL) + return MEMORY_E; +#endif + + if (err == MP_OKAY) { + a32 = t + 8; + + a32[0] = a[0]; + a32[0] |= a[1] << 29U; + a32[0] &= 0xffffffffL; + a32[1] = (a[1] >> 3); + a32[1] |= a[2] << 26U; + a32[1] &= 0xffffffffL; + a32[2] = (a[2] >> 6); + a32[2] |= a[3] << 23U; + a32[2] &= 0xffffffffL; + a32[3] = (a[3] >> 9); + a32[3] |= a[4] << 20U; + a32[3] &= 0xffffffffL; + a32[4] = (a[4] >> 12); + a32[4] |= a[5] << 17U; + a32[4] &= 0xffffffffL; + a32[5] = (a[5] >> 15); + a32[5] |= a[6] << 14U; + a32[5] &= 0xffffffffL; + a32[6] = (a[6] >> 18); + a32[6] |= a[7] << 11U; + a32[6] &= 0xffffffffL; + a32[7] = (a[7] >> 21); + a32[7] |= a[8] << 8U; + a32[7] &= 0xffffffffL; + + /* 1 1 0 -1 -1 -1 -1 0 */ + t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6]; + /* 0 1 1 0 -1 -1 -1 -1 */ + t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; + /* 0 0 1 1 0 -1 -1 -1 */ + t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; + /* -1 -1 0 2 2 1 0 -1 */ + t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; + /* 0 -1 -1 0 2 2 1 0 */ + t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; + /* 0 0 -1 -1 0 2 2 1 */ + t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; + /* -1 -1 0 0 0 1 3 2 */ + t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; + /* 1 0 -1 -1 -1 -1 0 3 */ + t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; + + t[1] += t[0] >> 32U; t[0] &= 0xffffffffL; + t[2] += t[1] >> 32U; t[1] &= 0xffffffffL; + t[3] += t[2] >> 32U; t[2] &= 0xffffffffL; + t[4] += t[3] >> 32U; t[3] &= 0xffffffffL; + t[5] += t[4] >> 32U; t[4] &= 0xffffffffL; + t[6] += t[5] >> 32U; t[5] &= 0xffffffffL; + t[7] += t[6] >> 32U; t[6] &= 0xffffffffL; + o = t[7] >> 32U; t[7] &= 0xffffffffL; + t[0] += o; + t[3] -= o; + t[6] -= o; + t[7] += o; + t[1] += t[0] >> 32U; t[0] &= 0xffffffffL; + t[2] += t[1] >> 32U; t[1] &= 0xffffffffL; + t[3] += t[2] >> 32U; t[2] &= 0xffffffffL; + t[4] += t[3] >> 32U; t[3] &= 0xffffffffL; + t[5] += t[4] >> 32U; t[4] &= 0xffffffffL; + t[6] += t[5] >> 32U; t[5] &= 0xffffffffL; + t[7] += t[6] >> 32U; t[6] &= 0xffffffffL; + + r[0] = (sp_digit)(t[0]) & 0x1fffffffL; + r[1] = (sp_digit)(t[0] >> 29U); + r[1] |= (sp_digit)(t[1] << 3U); + r[1] &= 0x1fffffffL; + r[2] = (sp_digit)(t[1] >> 26U); + r[2] |= (sp_digit)(t[2] << 6U); + r[2] &= 0x1fffffffL; + r[3] = (sp_digit)(t[2] >> 23U); + r[3] |= (sp_digit)(t[3] << 9U); + r[3] &= 0x1fffffffL; + r[4] = (sp_digit)(t[3] >> 20U); + r[4] |= (sp_digit)(t[4] << 12U); + r[4] &= 0x1fffffffL; + r[5] = (sp_digit)(t[4] >> 17U); + r[5] |= (sp_digit)(t[5] << 15U); + r[5] &= 0x1fffffffL; + r[6] = (sp_digit)(t[5] >> 14U); + r[6] |= (sp_digit)(t[6] << 18U); + r[6] &= 0x1fffffffL; + r[7] = (sp_digit)(t[6] >> 11U); + r[7] |= (sp_digit)(t[7] << 21U); + r[7] &= 0x1fffffffL; + r[8] = (sp_digit)(t[7] >> 8U); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t != NULL) + XFREE(t, NULL, DYNAMIC_TYPE_ECC); +#endif + + return err; +} + +#ifdef WOLFSSL_SP_SMALL +/* Multiply the point by the scalar and return the result. 
+ * If map is true then convert result to affine coordinates. + * + * Small implementation using add and double that is cache attack resistant but + * allocates memory rather than use large stacks. + * 256 adds and doubles. + * + * r Resulting point. + * g Point to multiply. + * k Scalar to multiply by. + * map Indicates whether to convert result to affine. + * ct Constant time required. + * heap Heap to use for allocation. + * returns MEMORY_E when memory allocation fails and MP_OKAY on success. + */ + +#ifdef WOLFSSL_SP_NONBLOCK +typedef struct sp_256_ecc_mulmod_9_ctx { + int state; + union { + sp_256_proj_point_dbl_9_ctx dbl_ctx; + sp_256_proj_point_add_9_ctx add_ctx; + }; + sp_point_256 t[3]; + sp_digit tmp[2 * 9 * 5]; + sp_digit n; + int i; + int c; + int y; +} sp_256_ecc_mulmod_9_ctx; + +static int sp_256_ecc_mulmod_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, + const sp_point_256* g, const sp_digit* k, int map, int ct, void* heap) +{ + int err = FP_WOULDBLOCK; + sp_256_ecc_mulmod_9_ctx* ctx = (sp_256_ecc_mulmod_9_ctx*)sp_ctx->data; + + typedef char ctx_size_test[sizeof(sp_256_ecc_mulmod_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1]; + (void)sizeof(ctx_size_test); + + /* Implementation is constant time. */ + (void)ct; + + switch (ctx->state) { + case 0: /* INIT */ + XMEMSET(ctx->t, 0, sizeof(sp_point_256) * 3); + ctx->i = 8; + ctx->c = 24; + ctx->n = k[ctx->i--] << (29 - ctx->c); + + /* t[0] = {0, 0, 1} * norm */ + ctx->t[0].infinity = 1; + ctx->state = 1; + break; + case 1: /* T1X */ + /* t[1] = {g->x, g->y, g->z} * norm */ + err = sp_256_mod_mul_norm_9(ctx->t[1].x, g->x, p256_mod); + ctx->state = 2; + break; + case 2: /* T1Y */ + err = sp_256_mod_mul_norm_9(ctx->t[1].y, g->y, p256_mod); + ctx->state = 3; + break; + case 3: /* T1Z */ + err = sp_256_mod_mul_norm_9(ctx->t[1].z, g->z, p256_mod); + ctx->state = 4; + break; + case 4: /* ADDPREP */ + if (ctx->c == 0) { + if (ctx->i == -1) { + ctx->state = 7; + break; + } + + ctx->n = k[ctx->i--]; + ctx->c = 29; + } + ctx->y = (ctx->n >> 28) & 1; + ctx->n <<= 1; + XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx)); + ctx->state = 5; + break; + case 5: /* ADD */ + err = sp_256_proj_point_add_9_nb((sp_ecc_ctx_t*)&ctx->add_ctx, + &ctx->t[ctx->y^1], &ctx->t[0], &ctx->t[1], ctx->tmp); + if (err == MP_OKAY) { + XMEMCPY(&ctx->t[2], (void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) + + ((size_t)&ctx->t[1] & addr_mask[ctx->y])), + sizeof(sp_point_256)); + XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx)); + ctx->state = 6; + } + break; + case 6: /* DBL */ + err = sp_256_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, &ctx->t[2], + &ctx->t[2], ctx->tmp); + if (err == MP_OKAY) { + XMEMCPY((void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) + + ((size_t)&ctx->t[1] & addr_mask[ctx->y])), &ctx->t[2], + sizeof(sp_point_256)); + ctx->state = 4; + ctx->c--; + } + break; + case 7: /* MAP */ + if (map != 0) { + sp_256_map_9(r, &ctx->t[0], ctx->tmp); + } + else { + XMEMCPY(r, &ctx->t[0], sizeof(sp_point_256)); + } + err = MP_OKAY; + break; + } + + if (err == MP_OKAY && ctx->state != 7) { + err = FP_WOULDBLOCK; + } + if (err != FP_WOULDBLOCK) { + ForceZero(ctx->tmp, sizeof(ctx->tmp)); + ForceZero(ctx->t, sizeof(ctx->t)); + } + + (void)heap; + + return err; +} + +#endif /* WOLFSSL_SP_NONBLOCK */ + +static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, + const sp_digit* k, int map, int ct, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* t = NULL; + sp_digit* tmp = NULL; +#else + sp_point_256 t[3]; + sp_digit 
tmp[2 * 9 * 5]; +#endif + sp_digit n; + int i; + int c; + int y; + int err = MP_OKAY; + + /* Implementation is constant time. */ + (void)ct; + (void)heap; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap, + DYNAMIC_TYPE_ECC); + if (t == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + DYNAMIC_TYPE_ECC); + if (tmp == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + XMEMSET(t, 0, sizeof(sp_point_256) * 3); + + /* t[0] = {0, 0, 1} * norm */ + t[0].infinity = 1; + /* t[1] = {g->x, g->y, g->z} * norm */ + err = sp_256_mod_mul_norm_9(t[1].x, g->x, p256_mod); + } + if (err == MP_OKAY) + err = sp_256_mod_mul_norm_9(t[1].y, g->y, p256_mod); + if (err == MP_OKAY) + err = sp_256_mod_mul_norm_9(t[1].z, g->z, p256_mod); + + if (err == MP_OKAY) { + i = 8; + c = 24; + n = k[i--] << (29 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) + break; + + n = k[i--]; + c = 29; + } + + y = (n >> 28) & 1; + n <<= 1; + + sp_256_proj_point_add_9(&t[y^1], &t[0], &t[1], tmp); + + XMEMCPY(&t[2], (void*)(((size_t)&t[0] & addr_mask[y^1]) + + ((size_t)&t[1] & addr_mask[y])), + sizeof(sp_point_256)); + sp_256_proj_point_dbl_9(&t[2], &t[2], tmp); + XMEMCPY((void*)(((size_t)&t[0] & addr_mask[y^1]) + + ((size_t)&t[1] & addr_mask[y])), &t[2], + sizeof(sp_point_256)); + } + + if (map != 0) { + sp_256_map_9(r, &t[0], tmp); + } + else { + XMEMCPY(r, &t[0], sizeof(sp_point_256)); + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (tmp != NULL) +#endif + { + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(tmp, heap, DYNAMIC_TYPE_ECC); + #endif + } +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t != NULL) +#endif + { + ForceZero(t, sizeof(sp_point_256) * 3); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(t, heap, DYNAMIC_TYPE_ECC); + #endif + } + + return err; +} + +#else +/* A table entry for pre-computed points. */ +typedef struct sp_table_entry_256 { + sp_digit x[9]; + sp_digit y[9]; +} sp_table_entry_256; + +/* Conditionally copy a into r using the mask m. + * m is -1 to copy and 0 when not. + * + * r A single precision number to copy over. + * a A single precision number to copy. + * m Mask value to apply. + */ +static void sp_256_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m) +{ + sp_digit t[9]; +#ifdef WOLFSSL_SP_SMALL + int i; + + for (i = 0; i < 9; i++) { + t[i] = r[i] ^ a[i]; + } + for (i = 0; i < 9; i++) { + r[i] ^= t[i] & m; + } +#else + t[ 0] = r[ 0] ^ a[ 0]; + t[ 1] = r[ 1] ^ a[ 1]; + t[ 2] = r[ 2] ^ a[ 2]; + t[ 3] = r[ 3] ^ a[ 3]; + t[ 4] = r[ 4] ^ a[ 4]; + t[ 5] = r[ 5] ^ a[ 5]; + t[ 6] = r[ 6] ^ a[ 6]; + t[ 7] = r[ 7] ^ a[ 7]; + t[ 8] = r[ 8] ^ a[ 8]; + r[ 0] ^= t[ 0] & m; + r[ 1] ^= t[ 1] & m; + r[ 2] ^= t[ 2] & m; + r[ 3] ^= t[ 3] & m; + r[ 4] ^= t[ 4] & m; + r[ 5] ^= t[ 5] & m; + r[ 6] ^= t[ 6] & m; + r[ 7] ^= t[ 7] & m; + r[ 8] ^= t[ 8] & m; +#endif /* WOLFSSL_SP_SMALL */ +} + +/* Double the Montgomery form projective point p a number of times. + * + * r Result of repeated doubling of point. + * p Point to double. + * n Number of times to double + * t Temporary ordinate data. 
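+ *
+ * Tracking W = Z^4 across iterations makes each subsequent doubling cheaper
+ * than a stand-alone double, but the point produced is the same as n single
+ * doublings, i.e. the (slower) sketch:
+ *
+ *     while (n-- > 0) {
+ *         sp_256_proj_point_dbl_9(p, p, t);
+ *     }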
+ */ +static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int n, + sp_digit* t) +{ + sp_digit* w = t; + sp_digit* a = t + 2*9; + sp_digit* b = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t2 = t + 8*9; + sp_digit* x; + sp_digit* y; + sp_digit* z; + + x = p->x; + y = p->y; + z = p->z; + + /* Y = 2*Y */ + sp_256_mont_dbl_9(y, y, p256_mod); + /* W = Z^4 */ + sp_256_mont_sqr_9(w, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(w, w, p256_mod, p256_mp_mod); + +#ifndef WOLFSSL_SP_SMALL + while (--n > 0) +#else + while (--n >= 0) +#endif + { + /* A = 3*(X^2 - W) */ + sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(t1, t1, w, p256_mod); + sp_256_mont_tpl_9(a, t1, p256_mod); + /* B = X*Y^2 */ + sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); + /* X = A^2 - 2B */ + sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod); + sp_256_mont_dbl_9(t2, b, p256_mod); + sp_256_mont_sub_9(x, x, t2, p256_mod); + /* Z = Z*Y */ + sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); + /* t2 = Y^4 */ + sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod); +#ifdef WOLFSSL_SP_SMALL + if (n != 0) +#endif + { + /* W = W*Y^4 */ + sp_256_mont_mul_9(w, w, t1, p256_mod, p256_mp_mod); + } + /* y = 2*A*(B - X) - Y^4 */ + sp_256_mont_sub_9(y, b, x, p256_mod); + sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod); + sp_256_mont_dbl_9(y, y, p256_mod); + sp_256_mont_sub_9(y, y, t1, p256_mod); + } +#ifndef WOLFSSL_SP_SMALL + /* A = 3*(X^2 - W) */ + sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(t1, t1, w, p256_mod); + sp_256_mont_tpl_9(a, t1, p256_mod); + /* B = X*Y^2 */ + sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); + /* X = A^2 - 2B */ + sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod); + sp_256_mont_dbl_9(t2, b, p256_mod); + sp_256_mont_sub_9(x, x, t2, p256_mod); + /* Z = Z*Y */ + sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); + /* t2 = Y^4 */ + sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod); + /* y = 2*A*(B - X) - Y^4 */ + sp_256_mont_sub_9(y, b, x, p256_mod); + sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod); + sp_256_mont_dbl_9(y, y, p256_mod); + sp_256_mont_sub_9(y, y, t1, p256_mod); +#endif + /* Y = Y/2 */ + sp_256_div2_9(y, y, p256_mod); +} + +/* Double the Montgomery form projective point p a number of times. + * + * r Result of repeated doubling of point. + * p Point to double. + * n Number of times to double + * t Temporary ordinate data. 
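+ *
+ * With n = 5 and m = 1, as used by the 6-bit window implementation below,
+ * this fills r[2], r[4], r[8], r[16] and r[32] with 2P, 4P, 8P, 16P and 32P.
+ * Assuming r[1] already holds P, a simpler (but slower) equivalent would be:
+ *
+ *     for (j = 2; j <= 32; j *= 2) {
+ *         XMEMCPY(&r[j], &r[j / 2], sizeof(sp_point_256));
+ *         sp_256_proj_point_dbl_9(&r[j], &r[j], t);
+ *     }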
+ */
+static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r,
+        const sp_point_256* p, int n, int m, sp_digit* t)
+{
+    sp_digit* w = t;
+    sp_digit* a = t + 2*9;
+    sp_digit* b = t + 4*9;
+    sp_digit* t1 = t + 6*9;
+    sp_digit* t2 = t + 8*9;
+    sp_digit* x = r[2*m].x;
+    sp_digit* y = r[(1<<n)*m].y;
+    sp_digit* z = r[2*m].z;
+    int i;
+    int j;
+
+    for (i=0; i<9; i++) {
+        x[i] = p->x[i];
+    }
+    for (i=0; i<9; i++) {
+        y[i] = p->y[i];
+    }
+    for (i=0; i<9; i++) {
+        z[i] = p->z[i];
+    }
+
+    /* Y = 2*Y */
+    sp_256_mont_dbl_9(y, y, p256_mod);
+    /* W = Z^4 */
+    sp_256_mont_sqr_9(w, z, p256_mod, p256_mp_mod);
+    sp_256_mont_sqr_9(w, w, p256_mod, p256_mp_mod);
+    j = m;
+    for (i=1; i<=n; i++) {
+        j *= 2;
+
+        /* A = 3*(X^2 - W) */
+        sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_9(t1, t1, w, p256_mod);
+        sp_256_mont_tpl_9(a, t1, p256_mod);
+        /* B = X*Y^2 */
+        sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(b, t2, x, p256_mod, p256_mp_mod);
+        x = r[j].x;
+        /* X = A^2 - 2B */
+        sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_9(t1, b, p256_mod);
+        sp_256_mont_sub_9(x, x, t1, p256_mod);
+        /* Z = Z*Y */
+        sp_256_mont_mul_9(r[j].z, z, y, p256_mod, p256_mp_mod);
+        z = r[j].z;
+        /* t2 = Y^4 */
+        sp_256_mont_sqr_9(t2, t2, p256_mod, p256_mp_mod);
+        if (i != n) {
+            /* W = W*Y^4 */
+            sp_256_mont_mul_9(w, w, t2, p256_mod, p256_mp_mod);
+        }
+        /* y = 2*A*(B - X) - Y^4 */
+        sp_256_mont_sub_9(y, b, x, p256_mod);
+        sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_9(y, y, p256_mod);
+        sp_256_mont_sub_9(y, y, t2, p256_mod);
+
+        /* Y = Y/2 */
+        sp_256_div2_9(r[j].y, y, p256_mod);
+        r[j].infinity = 0;
+    }
+}
+
+/* Add two Montgomery form projective points.
+ *
+ * ra  Result of addition.
+ * rs  Result of subtraction.
+ * p   First point to add.
+ * q   Second point to add.
+ * t   Temporary ordinate data.
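+ *
+ * P + Q and P - Q share U1, U2, S1, S2 and H, so producing both here is
+ * cheaper than the two separate additions it replaces (sketch only):
+ *
+ *     sp_point_256 nq = *q;
+ *     sp_256_proj_point_add_9(ra, p, q, t);      // ra = P + Q
+ *     sp_256_sub_9(nq.y, p256_mod, q->y);        // negate Q's y ordinate
+ *     sp_256_norm_9(nq.y);
+ *     sp_256_proj_point_add_9(rs, p, &nq, t);    // rs = P - Q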
+ */ +static void sp_256_proj_point_add_sub_9(sp_point_256* ra, + sp_point_256* rs, const sp_point_256* p, const sp_point_256* q, + sp_digit* t) +{ + sp_digit* t1 = t; + sp_digit* t2 = t + 2*9; + sp_digit* t3 = t + 4*9; + sp_digit* t4 = t + 6*9; + sp_digit* t5 = t + 8*9; + sp_digit* t6 = t + 10*9; + sp_digit* x = ra->x; + sp_digit* y = ra->y; + sp_digit* z = ra->z; + sp_digit* xs = rs->x; + sp_digit* ys = rs->y; + sp_digit* zs = rs->z; + + + XMEMCPY(x, p->x, sizeof(p->x) / 2); + XMEMCPY(y, p->y, sizeof(p->y) / 2); + XMEMCPY(z, p->z, sizeof(p->z) / 2); + ra->infinity = 0; + rs->infinity = 0; + + /* U1 = X1*Z2^2 */ + sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, x, p256_mod, p256_mp_mod); + /* U2 = X2*Z1^2 */ + sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); + /* S1 = Y1*Z2^3 */ + sp_256_mont_mul_9(t3, t3, y, p256_mod, p256_mp_mod); + /* S2 = Y2*Z1^3 */ + sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); + /* H = U2 - U1 */ + sp_256_mont_sub_9(t2, t2, t1, p256_mod); + /* RS = S2 + S1 */ + sp_256_mont_add_9(t6, t4, t3, p256_mod); + /* R = S2 - S1 */ + sp_256_mont_sub_9(t4, t4, t3, p256_mod); + /* Z3 = H*Z1*Z2 */ + /* ZS = H*Z1*Z2 */ + sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, z, sizeof(p->z)/2); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + /* XS = RS^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(xs, t6, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(x, x, t5, p256_mod); + sp_256_mont_sub_9(xs, xs, t5, p256_mod); + sp_256_mont_dbl_9(t1, y, p256_mod); + sp_256_mont_sub_9(x, x, t1, p256_mod); + sp_256_mont_sub_9(xs, xs, t1, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ + sp_256_mont_sub_9(ys, y, xs, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); + sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); + sp_256_sub_9(t6, p256_mod, t6); + sp_256_mont_mul_9(ys, ys, t6, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(y, y, t5, p256_mod); + sp_256_mont_sub_9(ys, ys, t5, p256_mod); +} + +/* Structure used to describe recoding of scalar multiplication. */ +typedef struct ecc_recode_256 { + /* Index into pre-computation table. */ + uint8_t i; + /* Use the negative of the point. */ + uint8_t neg; +} ecc_recode_256; + +/* The index into pre-computation table to use. */ +static const uint8_t recode_index_9_6[66] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 0, 1, +}; + +/* Whether to negate y-ordinate. */ +static const uint8_t recode_neg_9_6[66] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, +}; + +/* Recode the scalar for multiplication using pre-computed values and + * subtraction. + * + * k Scalar to multiply by. + * v Vector of operations to perform. 
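+ *
+ * Each of the 43 six-bit windows is mapped to an index in 0..32 plus a
+ * negate flag; window values of 32 or more are replaced by 64 minus the
+ * value, with the point negated and a carry of one into the next window.
+ * For example a window value of 61 (0x3d) recodes as 64 - 3:
+ *
+ *     v[i].i   = recode_index_9_6[61];    // 3
+ *     v[i].neg = recode_neg_9_6[61];      // 1
+ *     carry    = (61 >> 6) + v[i].neg;    // 1, added into the next window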
+ */ +static void sp_256_ecc_recode_6_9(const sp_digit* k, ecc_recode_256* v) +{ + int i; + int j; + uint8_t y; + int carry = 0; + int o; + sp_digit n; + + j = 0; + n = k[j]; + o = 0; + for (i=0; i<43; i++) { + y = (int8_t)n; + if (o + 6 < 29) { + y &= 0x3f; + n >>= 6; + o += 6; + } + else if (o + 6 == 29) { + n >>= 6; + if (++j < 9) + n = k[j]; + o = 0; + } + else if (++j < 9) { + n = k[j]; + y |= (uint8_t)((n << (29 - o)) & 0x3f); + o -= 23; + n >>= o; + } + + y += (uint8_t)carry; + v[i].i = recode_index_9_6[y]; + v[i].neg = recode_neg_9_6[y]; + carry = (y >> 6) + v[i].neg; + } +} + +#ifndef WC_NO_CACHE_RESISTANT +/* Touch each possible point that could be being copied. + * + * r Point to copy into. + * table Table - start of the entires to access + * idx Index of entry to retrieve. + */ +static void sp_256_get_point_33_9(sp_point_256* r, const sp_point_256* table, + int idx) +{ + int i; + sp_digit mask; + + r->x[0] = 0; + r->x[1] = 0; + r->x[2] = 0; + r->x[3] = 0; + r->x[4] = 0; + r->x[5] = 0; + r->x[6] = 0; + r->x[7] = 0; + r->x[8] = 0; + r->y[0] = 0; + r->y[1] = 0; + r->y[2] = 0; + r->y[3] = 0; + r->y[4] = 0; + r->y[5] = 0; + r->y[6] = 0; + r->y[7] = 0; + r->y[8] = 0; + r->z[0] = 0; + r->z[1] = 0; + r->z[2] = 0; + r->z[3] = 0; + r->z[4] = 0; + r->z[5] = 0; + r->z[6] = 0; + r->z[7] = 0; + r->z[8] = 0; + for (i = 1; i < 33; i++) { + mask = 0 - (i == idx); + r->x[0] |= mask & table[i].x[0]; + r->x[1] |= mask & table[i].x[1]; + r->x[2] |= mask & table[i].x[2]; + r->x[3] |= mask & table[i].x[3]; + r->x[4] |= mask & table[i].x[4]; + r->x[5] |= mask & table[i].x[5]; + r->x[6] |= mask & table[i].x[6]; + r->x[7] |= mask & table[i].x[7]; + r->x[8] |= mask & table[i].x[8]; + r->y[0] |= mask & table[i].y[0]; + r->y[1] |= mask & table[i].y[1]; + r->y[2] |= mask & table[i].y[2]; + r->y[3] |= mask & table[i].y[3]; + r->y[4] |= mask & table[i].y[4]; + r->y[5] |= mask & table[i].y[5]; + r->y[6] |= mask & table[i].y[6]; + r->y[7] |= mask & table[i].y[7]; + r->y[8] |= mask & table[i].y[8]; + r->z[0] |= mask & table[i].z[0]; + r->z[1] |= mask & table[i].z[1]; + r->z[2] |= mask & table[i].z[2]; + r->z[3] |= mask & table[i].z[3]; + r->z[4] |= mask & table[i].z[4]; + r->z[5] |= mask & table[i].z[5]; + r->z[6] |= mask & table[i].z[6]; + r->z[7] |= mask & table[i].z[7]; + r->z[8] |= mask & table[i].z[8]; + } +} +#endif /* !WC_NO_CACHE_RESISTANT */ +/* Multiply the point by the scalar and return the result. + * If map is true then convert result to affine coordinates. + * + * Window technique of 6 bits. (Add-Sub variation.) + * Calculate 0..32 times the point. Use function that adds and + * subtracts the same two points. + * Recode to add or subtract one of the computed points. + * Double to push up. + * NOT a sliding window. + * + * r Resulting point. + * g Point to multiply. + * k Scalar to multiply by. + * map Indicates whether to convert result to affine. + * ct Constant time required. + * heap Heap to use for allocation. + * returns MEMORY_E when memory allocation fails and MP_OKAY on success. + */ +static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256* g, + const sp_digit* k, int map, int ct, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* t = NULL; + sp_digit* tmp = NULL; +#else + sp_point_256 t[33+2]; + sp_digit tmp[2 * 9 * 6]; +#endif + sp_point_256* rt = NULL; + sp_point_256* p = NULL; + sp_digit* negy; + int i; + ecc_recode_256 v[43]; + int err = MP_OKAY; + + /* Constant time used for cache attack resistance implementation. 
*/ + (void)ct; + (void)heap; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + (33+2), heap, DYNAMIC_TYPE_ECC); + if (t == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, + heap, DYNAMIC_TYPE_ECC); + if (tmp == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + rt = t + 33; + p = t + 33+1; + + /* t[0] = {0, 0, 1} * norm */ + XMEMSET(&t[0], 0, sizeof(t[0])); + t[0].infinity = 1; + /* t[1] = {g->x, g->y, g->z} * norm */ + err = sp_256_mod_mul_norm_9(t[1].x, g->x, p256_mod); + } + if (err == MP_OKAY) { + err = sp_256_mod_mul_norm_9(t[1].y, g->y, p256_mod); + } + if (err == MP_OKAY) { + err = sp_256_mod_mul_norm_9(t[1].z, g->z, p256_mod); + } + + if (err == MP_OKAY) { + t[1].infinity = 0; + /* t[2] ... t[32] */ + sp_256_proj_point_dbl_n_store_9(t, &t[ 1], 5, 1, tmp); + sp_256_proj_point_add_9(&t[ 3], &t[ 2], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[ 6], &t[ 3], tmp); + sp_256_proj_point_add_sub_9(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[10], &t[ 5], tmp); + sp_256_proj_point_add_sub_9(&t[11], &t[ 9], &t[10], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[12], &t[ 6], tmp); + sp_256_proj_point_dbl_9(&t[14], &t[ 7], tmp); + sp_256_proj_point_add_sub_9(&t[15], &t[13], &t[14], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[18], &t[ 9], tmp); + sp_256_proj_point_add_sub_9(&t[19], &t[17], &t[18], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[20], &t[10], tmp); + sp_256_proj_point_dbl_9(&t[22], &t[11], tmp); + sp_256_proj_point_add_sub_9(&t[23], &t[21], &t[22], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[24], &t[12], tmp); + sp_256_proj_point_dbl_9(&t[26], &t[13], tmp); + sp_256_proj_point_add_sub_9(&t[27], &t[25], &t[26], &t[ 1], tmp); + sp_256_proj_point_dbl_9(&t[28], &t[14], tmp); + sp_256_proj_point_dbl_9(&t[30], &t[15], tmp); + sp_256_proj_point_add_sub_9(&t[31], &t[29], &t[30], &t[ 1], tmp); + + negy = t[0].y; + + sp_256_ecc_recode_6_9(k, v); + + i = 42; + #ifndef WC_NO_CACHE_RESISTANT + if (ct) { + sp_256_get_point_33_9(rt, t, v[i].i); + rt->infinity = !v[i].i; + } + else + #endif + { + XMEMCPY(rt, &t[v[i].i], sizeof(sp_point_256)); + } + for (--i; i>=0; i--) { + sp_256_proj_point_dbl_n_9(rt, 6, tmp); + + #ifndef WC_NO_CACHE_RESISTANT + if (ct) { + sp_256_get_point_33_9(p, t, v[i].i); + p->infinity = !v[i].i; + } + else + #endif + { + XMEMCPY(p, &t[v[i].i], sizeof(sp_point_256)); + } + sp_256_sub_9(negy, p256_mod, p->y); + sp_256_norm_9(negy); + sp_256_cond_copy_9(p->y, negy, (sp_digit)0 - v[i].neg); + sp_256_proj_point_add_9(rt, rt, p, tmp); + } + + if (map != 0) { + sp_256_map_9(r, rt, tmp); + } + else { + XMEMCPY(r, rt, sizeof(sp_point_256)); + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t != NULL) + XFREE(t, heap, DYNAMIC_TYPE_ECC); + if (tmp != NULL) + XFREE(tmp, heap, DYNAMIC_TYPE_ECC); +#endif + + return err; +} + +#ifdef FP_ECC +#endif /* FP_ECC */ +/* Add two Montgomery form projective points. The second point has a q value of + * one. + * Only the first point can be the same pointer as the result point. + * + * r Result of addition. + * p First point to add. + * q Second point to add. + * t Temporary ordinate data. 
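+ *
+ * With Z2 = 1 (in Montgomery form) the U1 = X1*Z2^2 and S1 = Y1*Z2^3 steps
+ * vanish, saving several field multiplications.  The result matches the
+ * general routine when Q carries Z = 1 (reference sketch only):
+ *
+ *     sp_point_256 q1 = *q;
+ *     XMEMCPY(q1.z, p256_norm_mod, sizeof(p256_norm_mod));
+ *     sp_256_proj_point_add_9(r, p, &q1, t);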
+ */ +static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p, + const sp_point_256* q, sp_digit* t) +{ + const sp_point_256* ap[2]; + sp_point_256* rp[2]; + sp_digit* t1 = t; + sp_digit* t2 = t + 2*9; + sp_digit* t3 = t + 4*9; + sp_digit* t4 = t + 6*9; + sp_digit* t5 = t + 8*9; + sp_digit* x; + sp_digit* y; + sp_digit* z; + int i; + + /* Check double */ + (void)sp_256_sub_9(t1, p256_mod, q->y); + sp_256_norm_9(t1); + if ((sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) & + (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, t1))) != 0) { + sp_256_proj_point_dbl_9(r, p, t); + } + else { + rp[0] = r; + + /*lint allow cast to different type of pointer*/ + rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ + XMEMSET(rp[1], 0, sizeof(sp_point_256)); + x = rp[p->infinity | q->infinity]->x; + y = rp[p->infinity | q->infinity]->y; + z = rp[p->infinity | q->infinity]->z; + + ap[0] = p; + ap[1] = q; + for (i=0; i<9; i++) { + r->x[i] = ap[p->infinity]->x[i]; + } + for (i=0; i<9; i++) { + r->y[i] = ap[p->infinity]->y[i]; + } + for (i=0; i<9; i++) { + r->z[i] = ap[p->infinity]->z[i]; + } + r->infinity = ap[p->infinity]->infinity; + + /* U2 = X2*Z1^2 */ + sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); + /* S2 = Y2*Z1^3 */ + sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); + /* H = U2 - X1 */ + sp_256_mont_sub_9(t2, t2, x, p256_mod); + /* R = S2 - Y1 */ + sp_256_mont_sub_9(t4, t4, y, p256_mod); + /* Z3 = H*Z1 */ + sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*X1*H^2 */ + sp_256_mont_sqr_9(t1, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(x, t1, t5, p256_mod); + sp_256_mont_dbl_9(t1, t3, p256_mod); + sp_256_mont_sub_9(x, x, t1, p256_mod); + /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ + sp_256_mont_sub_9(t3, t3, x, p256_mod); + sp_256_mont_mul_9(t3, t3, t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(y, t3, t5, p256_mod); + } +} + +#ifdef FP_ECC +/* Convert the projective point to affine. + * Ordinates are in Montgomery form. + * + * a Point to convert. + * t Temporary data. + */ +static void sp_256_proj_to_affine_9(sp_point_256* a, sp_digit* t) +{ + sp_digit* t1 = t; + sp_digit* t2 = t + 2 * 9; + sp_digit* tmp = t + 4 * 9; + + sp_256_mont_inv_9(t1, a->z, tmp); + + sp_256_mont_sqr_9(t2, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t2, t1, p256_mod, p256_mp_mod); + + sp_256_mont_mul_9(a->x, a->x, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(a->y, a->y, t1, p256_mod, p256_mp_mod); + XMEMCPY(a->z, p256_norm_mod, sizeof(p256_norm_mod)); +} + +/* Generate the pre-computed table of points for the base point. + * + * width = 8 + * 256 entries + * 32 bits between + * + * a The base point. + * table Place to store generated point data. + * tmp Temporary data. + * heap Heap to use for allocation. 
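+ *
+ * Entry j combines the stripe points selected by the bits of j, so for
+ * example (with A the point the table is generated from):
+ *
+ *     table[0x05] = (2^0)*A + (2^64)*A      // bits 0 and 2 of 0x05 set
+ *
+ * letting the stripe multiplier pick one bit from each 32-bit stripe of
+ * the scalar per double-and-add step.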
+ */
+static int sp_256_gen_stripe_table_9(const sp_point_256* a,
+        sp_table_entry_256* table, sp_digit* tmp, void* heap)
+{
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    sp_point_256* t = NULL;
+#else
+    sp_point_256 t[3];
+#endif
+    sp_point_256* s1 = NULL;
+    sp_point_256* s2 = NULL;
+    int i;
+    int j;
+    int err = MP_OKAY;
+
+    (void)heap;
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap,
+                                     DYNAMIC_TYPE_ECC);
+    if (t == NULL)
+        err = MEMORY_E;
+#endif
+
+    if (err == MP_OKAY) {
+        s1 = t + 1;
+        s2 = t + 2;
+
+        err = sp_256_mod_mul_norm_9(t->x, a->x, p256_mod);
+    }
+    if (err == MP_OKAY) {
+        err = sp_256_mod_mul_norm_9(t->y, a->y, p256_mod);
+    }
+    if (err == MP_OKAY) {
+        err = sp_256_mod_mul_norm_9(t->z, a->z, p256_mod);
+    }
+    if (err == MP_OKAY) {
+        t->infinity = 0;
+        sp_256_proj_to_affine_9(t, tmp);
+
+        XMEMCPY(s1->z, p256_norm_mod, sizeof(p256_norm_mod));
+        s1->infinity = 0;
+        XMEMCPY(s2->z, p256_norm_mod, sizeof(p256_norm_mod));
+        s2->infinity = 0;
+
+        /* table[0] = {0, 0, infinity} */
+        XMEMSET(&table[0], 0, sizeof(sp_table_entry_256));
+        /* table[1] = Affine version of 'a' in Montgomery form */
+        XMEMCPY(table[1].x, t->x, sizeof(table->x));
+        XMEMCPY(table[1].y, t->y, sizeof(table->y));
+
+        for (i=1; i<8; i++) {
+            sp_256_proj_point_dbl_n_9(t, 32, tmp);
+            sp_256_proj_to_affine_9(t, tmp);
+            XMEMCPY(table[1<<i].x, t->x, sizeof(table->x));
+            XMEMCPY(table[1<<i].y, t->y, sizeof(table->y));
+        }
+
+        for (i=1; i<8; i++) {
+            XMEMCPY(s1->x, table[1<<i].x, sizeof(table->x));
+            XMEMCPY(s1->y, table[1<<i].y, sizeof(table->y));
+            for (j=(1<<i)+1; j<(1<<(i+1)); j++) {
+                XMEMCPY(s2->x, table[j-(1<<i)].x, sizeof(table->x));
+                XMEMCPY(s2->y, table[j-(1<<i)].y, sizeof(table->y));
+                sp_256_proj_point_add_qz1_9(t, s1, s2, tmp);
+                sp_256_proj_to_affine_9(t, tmp);
+                XMEMCPY(table[j].x, t->x, sizeof(table->x));
+                XMEMCPY(table[j].y, t->y, sizeof(table->y));
+            }
+        }
+    }
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    if (t != NULL)
+        XFREE(t, heap, DYNAMIC_TYPE_ECC);
+#endif
+
+    return err;
+}
+
+#endif /* FP_ECC */
+#ifndef WC_NO_CACHE_RESISTANT
+/* Touch each possible entry that could be being copied.
+ *
+ * r      Point to copy into.
+ * table  Table - start of the entries to access
+ * idx    Index of entry to retrieve.
+ */
+static void sp_256_get_entry_256_9(sp_point_256* r,
+    const sp_table_entry_256* table, int idx)
+{
+    int i;
+    sp_digit mask;
+
+    r->x[0] = 0;
+    r->x[1] = 0;
+    r->x[2] = 0;
+    r->x[3] = 0;
+    r->x[4] = 0;
+    r->x[5] = 0;
+    r->x[6] = 0;
+    r->x[7] = 0;
+    r->x[8] = 0;
+    r->y[0] = 0;
+    r->y[1] = 0;
+    r->y[2] = 0;
+    r->y[3] = 0;
+    r->y[4] = 0;
+    r->y[5] = 0;
+    r->y[6] = 0;
+    r->y[7] = 0;
+    r->y[8] = 0;
+    for (i = 1; i < 256; i++) {
+        mask = 0 - (i == idx);
+        r->x[0] |= mask & table[i].x[0];
+        r->x[1] |= mask & table[i].x[1];
+        r->x[2] |= mask & table[i].x[2];
+        r->x[3] |= mask & table[i].x[3];
+        r->x[4] |= mask & table[i].x[4];
+        r->x[5] |= mask & table[i].x[5];
+        r->x[6] |= mask & table[i].x[6];
+        r->x[7] |= mask & table[i].x[7];
+        r->x[8] |= mask & table[i].x[8];
+        r->y[0] |= mask & table[i].y[0];
+        r->y[1] |= mask & table[i].y[1];
+        r->y[2] |= mask & table[i].y[2];
+        r->y[3] |= mask & table[i].y[3];
+        r->y[4] |= mask & table[i].y[4];
+        r->y[5] |= mask & table[i].y[5];
+        r->y[6] |= mask & table[i].y[6];
+        r->y[7] |= mask & table[i].y[7];
+        r->y[8] |= mask & table[i].y[8];
+    }
+}
+#endif /* !WC_NO_CACHE_RESISTANT */
+/* Multiply the point by the scalar and return the result.
+ * If map is true then convert result to affine coordinates.
+ *
+ * Stripe implementation.
+ * Pre-generated: 2^0, 2^32, ...
+ * Pre-generated: products of all combinations of above. + * 8 doubles and adds (with qz=1) + * + * r Resulting point. + * k Scalar to multiply by. + * table Pre-computed table. + * map Indicates whether to convert result to affine. + * ct Constant time required. + * heap Heap to use for allocation. + * returns MEMORY_E when memory allocation fails and MP_OKAY on success. + */ +static int sp_256_ecc_mulmod_stripe_9(sp_point_256* r, const sp_point_256* g, + const sp_table_entry_256* table, const sp_digit* k, int map, + int ct, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* rt = NULL; + sp_digit* t = NULL; +#else + sp_point_256 rt[2]; + sp_digit t[2 * 9 * 5]; +#endif + sp_point_256* p = NULL; + int i; + int j; + int y; + int x; + int err = MP_OKAY; + + (void)g; + /* Constant time used for cache attack resistance implementation. */ + (void)ct; + (void)heap; + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + rt = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + DYNAMIC_TYPE_ECC); + if (rt == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + DYNAMIC_TYPE_ECC); + if (t == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = rt + 1; + + XMEMCPY(p->z, p256_norm_mod, sizeof(p256_norm_mod)); + XMEMCPY(rt->z, p256_norm_mod, sizeof(p256_norm_mod)); + + y = 0; + x = 31; + for (j=0; j<8; j++) { + y |= (int)(((k[x / 29] >> (x % 29)) & 1) << j); + x += 32; + } + #ifndef WC_NO_CACHE_RESISTANT + if (ct) { + sp_256_get_entry_256_9(rt, table, y); + } else + #endif + { + XMEMCPY(rt->x, table[y].x, sizeof(table[y].x)); + XMEMCPY(rt->y, table[y].y, sizeof(table[y].y)); + } + rt->infinity = !y; + for (i=30; i>=0; i--) { + y = 0; + x = i; + for (j=0; j<8; j++) { + y |= (int)(((k[x / 29] >> (x % 29)) & 1) << j); + x += 32; + } + + sp_256_proj_point_dbl_9(rt, rt, t); + #ifndef WC_NO_CACHE_RESISTANT + if (ct) { + sp_256_get_entry_256_9(p, table, y); + } + else + #endif + { + XMEMCPY(p->x, table[y].x, sizeof(table[y].x)); + XMEMCPY(p->y, table[y].y, sizeof(table[y].y)); + } + p->infinity = !y; + sp_256_proj_point_add_qz1_9(rt, rt, p, t); + } + + if (map != 0) { + sp_256_map_9(r, rt, t); + } + else { + XMEMCPY(r, rt, sizeof(sp_point_256)); + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t != NULL) + XFREE(t, heap, DYNAMIC_TYPE_ECC); + if (rt != NULL) + XFREE(rt, heap, DYNAMIC_TYPE_ECC); +#endif + + return err; +} + +#ifdef FP_ECC +#ifndef FP_ENTRIES + #define FP_ENTRIES 16 +#endif + +/* Cache entry - holds precomputation tables for a point. */ +typedef struct sp_cache_256_t { + /* X ordinate of point that table was generated from. */ + sp_digit x[9]; + /* Y ordinate of point that table was generated from. */ + sp_digit y[9]; + /* Precomputation table for point. */ + sp_table_entry_256 table[256]; + /* Count of entries in table. */ + uint32_t cnt; + /* Point and table set in entry. */ + int set; +} sp_cache_256_t; + +/* Cache of tables. */ +static THREAD_LS_T sp_cache_256_t sp_cache_256[FP_ENTRIES]; +/* Index of last entry in cache. */ +static THREAD_LS_T int sp_cache_256_last = -1; +/* Cache has been initialized. */ +static THREAD_LS_T int sp_cache_256_inited = 0; + +#ifndef HAVE_THREAD_LS + static volatile int initCacheMutex_256 = 0; + static wolfSSL_Mutex sp_cache_256_lock; +#endif + +/* Get the cache entry for the point. + * + * g [in] Point scalar multipling. + * cache [out] Cache table to use. 
+ */
+static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
+{
+    int i;
+    int j;
+    uint32_t least;
+
+    if (sp_cache_256_inited == 0) {
+        for (i=0; i<FP_ENTRIES; i++) {
+            sp_cache_256[i].set = 0;
+        }
+        sp_cache_256_inited = 1;
+    }
+
+    /* Compare point with those in cache. */
+    for (i=0; i<FP_ENTRIES; i++) {
+        if (!sp_cache_256[i].set)
+            continue;
+
+        if (sp_256_cmp_equal_9(g->x, sp_cache_256[i].x) &
+               sp_256_cmp_equal_9(g->y, sp_cache_256[i].y)) {
+            sp_cache_256[i].cnt++;
+            break;
+        }
+    }
+
+    /* No match. */
+    if (i == FP_ENTRIES) {
+        /* Find empty entry. */
+        i = (sp_cache_256_last + 1) % FP_ENTRIES;
+        for (; i != sp_cache_256_last; i=(i+1)%FP_ENTRIES) {
+            if (!sp_cache_256[i].set) {
+                break;
+            }
+        }
+
+        /* Evict least used. */
+        if (i == sp_cache_256_last) {
+            least = sp_cache_256[0].cnt;
+            for (j=1; j<FP_ENTRIES; j++) {
+                if (sp_cache_256[j].cnt < least) {
+                    i = j;
+                    least = sp_cache_256[i].cnt;
+                }
+            }
+        }
+
+        XMEMCPY(sp_cache_256[i].x, g->x, sizeof(sp_cache_256[i].x));
+        XMEMCPY(sp_cache_256[i].y, g->y, sizeof(sp_cache_256[i].y));
+        sp_cache_256[i].set = 1;
+        sp_cache_256[i].cnt = 1;
+    }
+
+    *cache = &sp_cache_256[i];
+    sp_cache_256_last = i;
+}
+#endif /* FP_ECC */
+
+/* Multiply the point by the scalar and return the result.
+ * If map is true then convert result to affine coordinates.
+ *
+ * r     Resulting point.
+ * g     Point to multiply.
+ * k     Scalar to multiply by.
+ * map   Indicates whether to convert result to affine.
+ * ct    Constant time required.
+ * heap  Heap to use for allocation.
+ * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
+ */
+static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, const sp_digit* k,
+        int map, int ct, void* heap)
+{
+#ifndef FP_ECC
+    return sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
+#else
+    sp_digit tmp[2 * 9 * 5];
+    sp_cache_256_t* cache;
+    int err = MP_OKAY;
+
+#ifndef HAVE_THREAD_LS
+    if (initCacheMutex_256 == 0) {
+        wc_InitMutex(&sp_cache_256_lock);
+        initCacheMutex_256 = 1;
+    }
+    if (wc_LockMutex(&sp_cache_256_lock) != 0)
+        err = BAD_MUTEX_E;
+#endif /* HAVE_THREAD_LS */
+
+    if (err == MP_OKAY) {
+        sp_ecc_get_cache_256(g, &cache);
+        if (cache->cnt == 2)
+            sp_256_gen_stripe_table_9(g, cache->table, tmp, heap);
+
+#ifndef HAVE_THREAD_LS
+        wc_UnLockMutex(&sp_cache_256_lock);
+#endif /* HAVE_THREAD_LS */
+
+        if (cache->cnt < 2) {
+            err = sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
+        }
+        else {
+            err = sp_256_ecc_mulmod_stripe_9(r, g, cache->table, k,
+                    map, ct, heap);
+        }
+    }
+
+    return err;
+#endif
+}
+
+#endif
+/* Multiply the point by the scalar and return the result.
+ * If map is true then convert result to affine coordinates.
+ *
+ * km    Scalar to multiply by.
+ * p     Point to multiply.
+ * r     Resulting point.
+ * map   Indicates whether to convert result to affine.
+ * heap  Heap to use for allocation.
+ * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
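+ *
+ * Typical use from the ECC layer (illustrative only, with k an initialized
+ * mp_int holding the scalar and G, R allocated ecc_points):
+ *
+ *     ret = sp_ecc_mulmod_256(&k, G, R, 1, NULL);    // R = k*G in affine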
+ */ +int sp_ecc_mulmod_256(const mp_int* km, const ecc_point* gm, ecc_point* r, + int map, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* point = NULL; + sp_digit* k = NULL; +#else + sp_point_256 point[1]; + sp_digit k[9]; +#endif + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, + DYNAMIC_TYPE_ECC); + if (point == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap, + DYNAMIC_TYPE_ECC); + if (k == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + sp_256_from_mp(k, 9, km); + sp_256_point_from_ecc_point_9(point, gm); + + err = sp_256_ecc_mulmod_9(point, point, k, map, 1, heap); + } + if (err == MP_OKAY) { + err = sp_256_point_to_ecc_point_9(point, r); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (k != NULL) + XFREE(k, heap, DYNAMIC_TYPE_ECC); + if (point != NULL) + XFREE(point, heap, DYNAMIC_TYPE_ECC); +#endif + + return err; +} + +/* Multiply the point by the scalar, add point a and return the result. + * If map is true then convert result to affine coordinates. + * + * km Scalar to multiply by. + * p Point to multiply. + * am Point to add to scalar mulitply result. + * inMont Point to add is in montogmery form. + * r Resulting point. + * map Indicates whether to convert result to affine. + * heap Heap to use for allocation. + * returns MEMORY_E when memory allocation fails and MP_OKAY on success. + */ +int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, + const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* point = NULL; + sp_digit* k = NULL; +#else + sp_point_256 point[2]; + sp_digit k[9 + 9 * 2 * 5]; +#endif + sp_point_256* addP = NULL; + sp_digit* tmp = NULL; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + DYNAMIC_TYPE_ECC); + if (point == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + k = (sp_digit*)XMALLOC( + sizeof(sp_digit) * (9 + 9 * 2 * 5), heap, + DYNAMIC_TYPE_ECC); + if (k == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + addP = point + 1; + tmp = k + 9; + + sp_256_from_mp(k, 9, km); + sp_256_point_from_ecc_point_9(point, gm); + sp_256_point_from_ecc_point_9(addP, am); + } + if ((err == MP_OKAY) && (!inMont)) { + err = sp_256_mod_mul_norm_9(addP->x, addP->x, p256_mod); + } + if ((err == MP_OKAY) && (!inMont)) { + err = sp_256_mod_mul_norm_9(addP->y, addP->y, p256_mod); + } + if ((err == MP_OKAY) && (!inMont)) { + err = sp_256_mod_mul_norm_9(addP->z, addP->z, p256_mod); + } + if (err == MP_OKAY) { + err = sp_256_ecc_mulmod_9(point, point, k, 0, 0, heap); + } + if (err == MP_OKAY) { + sp_256_proj_point_add_9(point, point, addP, tmp); + + if (map) { + sp_256_map_9(point, point, tmp); + } + + err = sp_256_point_to_ecc_point_9(point, r); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (k != NULL) + XFREE(k, heap, DYNAMIC_TYPE_ECC); + if (point != NULL) + XFREE(point, heap, DYNAMIC_TYPE_ECC); +#endif + + return err; +} + +#ifdef WOLFSSL_SP_SMALL +/* Multiply the base point of P256 by the scalar and return the result. + * If map is true then convert result to affine coordinates. + * + * r Resulting point. + * k Scalar to multiply by. 
+ * map Indicates whether to convert result to affine. + * heap Heap to use for allocation. + * returns MEMORY_E when memory allocation fails and MP_OKAY on success. + */ +static int sp_256_ecc_mulmod_base_9(sp_point_256* r, const sp_digit* k, + int map, int ct, void* heap) +{ + /* No pre-computed values. */ + return sp_256_ecc_mulmod_9(r, &p256_base, k, map, ct, heap); +} + +#else +/* Striping precomputation table. + * 8 points combined into a table of 256 points. + * Distance of 32 between points. + */ +static const sp_table_entry_256 p256_table[256] = { + /* 0 */ + { { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }, + /* 1 */ + { { 0x18a9143c,0x0f3986a0,0x1b6d805e,0x152bf8bf,0x0251075b,0x1995bbb1, + 0x1719e7ed,0x0ed4a6ea,0x0018905f }, + { 0x0e95560a,0x0f929abe,0x06791737,0x1571c974,0x1f3258b4,0x03446e90, + 0x16174ba2,0x0304b10b,0x008571ff } }, + /* 2 */ + { { 0x0147519a,0x01443012,0x0cdcbc08,0x103d584d,0x1ebc8d09,0x13e553c2, + 0x03a6a752,0x01bb7beb,0x00d953c5 }, + { 0x1d590f8f,0x0b1b0e67,0x19b245e7,0x12c4d689,0x164cf72e,0x10881175, + 0x03cdff65,0x0fd3d651,0x00863ebb } }, + /* 3 */ + { { 0x1cdb6485,0x02b5b11a,0x028be5de,0x1e1d445e,0x0300b808,0x0caa27bf, + 0x0280f9a3,0x0ab6bff0,0x00000760 }, + { 0x038d2010,0x11a75cdc,0x10dc229d,0x029f7664,0x06606540,0x1e9cc215, + 0x1b838391,0x0c2686e7,0x00830877 } }, + /* 4 */ + { { 0x16a0d2bb,0x1c917e28,0x188d2653,0x1982d834,0x02c8b0d5,0x079d2be3, + 0x19fe4907,0x0c3fa36c,0x002f5e69 }, + { 0x15a01797,0x00ae385f,0x05586497,0x01689ac1,0x1db523d2,0x0d9b838f, + 0x1dec1244,0x02d1ade1,0x00f648f9 } }, + /* 5 */ + { { 0x0137bbbc,0x12b3423f,0x1a82fb27,0x088d3d14,0x13463e43,0x13b0bceb, + 0x0056c710,0x10a267a0,0x005abe02 }, + { 0x004c7dab,0x15541be6,0x098301e4,0x1b3e9886,0x0cc37573,0x0ab13c73, + 0x0e0c324c,0x0b6d6dee,0x0094bb72 } }, + /* 6 */ + { { 0x120f141c,0x1fcda47b,0x1d6f1d2e,0x13679a5b,0x045c4619,0x1094a088, + 0x13bf70fd,0x1965efb8,0x00cdd6bb }, + { 0x0af436fd,0x0533805f,0x04c9afb3,0x08fedb73,0x125226f6,0x13c900a7, + 0x17d8303e,0x17a97b5c,0x00a361be } }, + /* 7 */ + { { 0x197c13c7,0x05512ac2,0x0df0f84a,0x1ac6bea1,0x09d1dc38,0x0d7679e0, + 0x04b01c0e,0x013896a5,0x00ba12ca }, + { 0x19f91dfd,0x12047d22,0x1a81fee7,0x0876cd9d,0x00b293af,0x1844cebc, + 0x1d2c7b3a,0x13ae03fd,0x0053ebb9 } }, + /* 8 */ + { { 0x10e63d34,0x1f3f718d,0x1953ead3,0x000ae553,0x1b5a4f46,0x199a6af3, + 0x00c70124,0x1240daa9,0x008589fb }, + { 0x0583553a,0x1387ae63,0x1592796a,0x121295c4,0x04652087,0x02838802, + 0x113f3241,0x0da04a83,0x00ebb069 } }, + /* 9 */ + { { 0x0c1647c5,0x10b650ad,0x13d5e651,0x04fa8f89,0x1fbacb81,0x1551bb26, + 0x168f7199,0x197a364f,0x00eb2820 }, + { 0x0a87e008,0x0037c6c3,0x08de3ce5,0x1bf53b24,0x0ecb2d87,0x17214066, + 0x08755bb4,0x136ab4fb,0x001f2828 } }, + /* 10 */ + { { 0x1b89da99,0x1dd50601,0x0a1008aa,0x05af3d70,0x005e8a6f,0x1c315c0e, + 0x158c9e11,0x0b20bca9,0x00337a4b }, + { 0x01f7794a,0x033a8069,0x1b5fd84f,0x000b6efa,0x1d6e8207,0x1bc08267, + 0x0f582968,0x1abe985f,0x000d65e0 } }, + /* 11 */ + { { 0x15275d38,0x0e84ddf5,0x1828d636,0x114e8a17,0x0b265426,0x17fa4b9f, + 0x08cbc1d8,0x084a5e94,0x00c23da2 }, + { 0x0b94520c,0x0d0dc278,0x16f5e397,0x0ccec760,0x09ea1096,0x05c34a69, + 0x1fc4e937,0x1198f219,0x0019de3b } }, + /* 12 */ + { { 0x06c5fe04,0x01d38b61,0x0e86f6c6,0x11bc1677,0x1712c3b2,0x02c35265, + 0x0ff5d0cb,0x1a923f99,0x00e34dcb }, + { 0x0aa58403,0x0046a35d,0x1a5e94ed,0x12e90d05,0x0a8af9a6,0x00939b55, + 0x1dfe78e4,0x088f69c1,0x00e7641f } }, + /* 13 */ + { { 
0x1f64ba59,0x0ba9ca0e,0x0090bf1f,0x1e21d816,0x01859d33,0x0fe350ac, + 0x1efd3c1b,0x0ae0a54a,0x004a12df }, + { 0x1439dbd0,0x1d319c7c,0x194f87ef,0x0497a97b,0x1b314d3c,0x07fd10f8, + 0x091bf579,0x12776b7d,0x006af5aa } }, + /* 14 */ + { { 0x10c91999,0x1085b4c8,0x16012476,0x09688054,0x020900a2,0x0a5a5c66, + 0x004cf802,0x0b4cd488,0x005fe347 }, + { 0x193e7b4b,0x07c655ef,0x08fe46ac,0x16a034f8,0x06263292,0x04d7668f, + 0x04590ba2,0x011d9fd5,0x00b544e3 } }, + /* 15 */ + { { 0x16ddfdce,0x03c63748,0x045e7999,0x0522cdf1,0x067e12c3,0x173b26a7, + 0x082d3a35,0x17b4d618,0x00e0b6b2 }, + { 0x1b7efb57,0x09896f95,0x031001c3,0x181bbcf2,0x1c9441aa,0x1b56b3cd, + 0x1dd3e40c,0x1bc4b4c6,0x0071c023 } }, + /* 16 */ + { { 0x1fe20925,0x15461225,0x173a19d8,0x0335871f,0x0706391c,0x12eaee9c, + 0x13d96a5a,0x1a843a64,0x0061d587 }, + { 0x037173ea,0x03b39d15,0x1de2d97a,0x090010a6,0x0b43e238,0x020f02dd, + 0x1ef843e1,0x0248c43d,0x00fa11fe } }, + /* 17 */ + { { 0x0cb19ffd,0x0448f959,0x048f08c7,0x151ab763,0x1ca8e01b,0x1eb3c562, + 0x1b72db40,0x0983e277,0x00586eb0 }, + { 0x07e8ed09,0x01ae3729,0x067b7883,0x03467830,0x052fa1e8,0x0b602b63, + 0x1c449e3f,0x010e10c9,0x0019d5ac } }, + /* 18 */ + { { 0x109a4e1f,0x14cfac09,0x09c01d07,0x1bce37d2,0x08d20ab7,0x1785f7e9, + 0x18fc9a97,0x07eff38a,0x00e7c007 }, + { 0x0ef59f76,0x1b6b31d0,0x1f2c1407,0x1676a841,0x002d4669,0x0fbd3d33, + 0x102b0230,0x1fd8cb67,0x00e08504 } }, + /* 19 */ + { { 0x0031b3ca,0x04c7b46d,0x169b59bc,0x19573dcd,0x046e86d1,0x00fd4a79, + 0x1ad16ff6,0x104b6132,0x0078f018 }, + { 0x1a25787f,0x1f77ef21,0x132b26ed,0x0df01a3b,0x1fc36801,0x043bd9ad, + 0x11e833a9,0x170fd28e,0x0043a773 } }, + /* 20 */ + { { 0x12b533d5,0x12bbb9a6,0x0f777018,0x1715ed43,0x0c293673,0x1e4d53cf, + 0x1ac55df9,0x0a38764c,0x00bb6de6 }, + { 0x165259b3,0x1f4981d5,0x0e9d2039,0x015fa7a0,0x0fc27d6a,0x01e8cd9e, + 0x066f16b2,0x134ba317,0x0060b461 } }, + /* 21 */ + { { 0x1ae5aa1c,0x0b51c708,0x19cd962f,0x0eca5693,0x187edb8b,0x000a772f, + 0x1f342c4c,0x1655dd7f,0x009d0f27 }, + { 0x1a730a55,0x1492318b,0x0ef20eb2,0x0ab65fbb,0x19a719c9,0x0ff05600, + 0x12341f07,0x0da6add8,0x00244a56 } }, + /* 22 */ + { { 0x0acf1f96,0x0d81ca57,0x1309c71b,0x02455204,0x1d3b99f2,0x160dc165, + 0x1da4989a,0x10e6b03d,0x0045e58c }, + { 0x038f9dbc,0x1ffa3ced,0x02281034,0x15e28dd1,0x0bed7a8a,0x0fd92370, + 0x1e92516b,0x03983c96,0x00c040e2 } }, + /* 23 */ + { { 0x0f8117b6,0x03d78003,0x08d50ce1,0x12d3fee7,0x075eb651,0x1abb0eca, + 0x1b1d20ac,0x12ed058d,0x001cdf5c }, + { 0x11f04839,0x0dbbada0,0x1785a61f,0x1d59e891,0x132197db,0x0ee8db85, + 0x1cf6ca48,0x1f1525bf,0x00046755 } }, + /* 24 */ + { { 0x1ce8ffcd,0x04562e95,0x1986a0b3,0x0789165f,0x0d6c70d5,0x10b93901, + 0x17cfdbc5,0x02277074,0x00046e5e }, + { 0x18007f01,0x1dc7fb26,0x1d0c60f9,0x03de24b5,0x1a03c7fb,0x0f531af0, + 0x016c1171,0x186607a0,0x006e0106 } }, + /* 25 */ + { { 0x08dd73b1,0x0639ac24,0x17b43652,0x00e11f32,0x02ab7767,0x0f5462b5, + 0x1c7ce0e1,0x1dbd2039,0x00442594 }, + { 0x12d4b65b,0x07d51648,0x12430dfe,0x0468772d,0x18d1f94c,0x1250af4b, + 0x1a3b4c9b,0x0a2985dc,0x00a796fa } }, + /* 26 */ + { { 0x023addd7,0x0cfdb024,0x19a4eccd,0x14c307ca,0x13c809e2,0x1bc71e5f, + 0x1ba7e216,0x1538d2ec,0x00e4ad2d }, + { 0x0e048a61,0x0bfbfa14,0x04b6680d,0x1a331981,0x0d8ef082,0x0d7a601f, + 0x050ff0e8,0x08d86f6a,0x00c5e940 } }, + /* 27 */ + { { 0x0be75f9e,0x1b529c61,0x048e9e11,0x0353d196,0x1c04b6fd,0x06f85884, + 0x1d1f6179,0x15fb68c8,0x0063283d }, + { 0x1af2df15,0x139467bd,0x1669fd33,0x0588aa15,0x0bcc3e59,0x1356f41a, + 0x04e3eac8,0x15633035,0x0068bd19 } }, + /* 28 */ + { { 
0x1887d659,0x04756a88,0x164c16b0,0x09abe966,0x14fe3337,0x14c0e7f3, + 0x1f5a5a61,0x1ea78dfb,0x00495292 }, + { 0x1acec896,0x143c64f0,0x16d12112,0x096421d8,0x160a7d96,0x1bf13326, + 0x00dd9a5b,0x01a4c06d,0x000ec753 } }, + /* 29 */ + { { 0x0d2687bb,0x0d09d02d,0x0b887e8b,0x1076d5e6,0x0607ba1f,0x0f7a8eea, + 0x1c2ce43d,0x14cc90c7,0x000f6207 }, + { 0x0f138233,0x0b3f1dd8,0x0aa9c62f,0x0d72d84e,0x088aedd6,0x02039376, + 0x173e3b40,0x0e411dad,0x00ff0db0 } }, + /* 30 */ + { { 0x0c95d553,0x04fd080a,0x1a02a29d,0x00a5faba,0x1566fa44,0x018bff9d, + 0x1a8c60ed,0x07910e81,0x00313b51 }, + { 0x08d11549,0x00171560,0x17b8872d,0x1dc21769,0x0320e071,0x03eea3f9, + 0x1e049ae6,0x1f30de33,0x002d3abc } }, + /* 31 */ + { { 0x015581a2,0x0144280c,0x08846bd3,0x14daacc6,0x12e999a0,0x1d078655, + 0x137c66e9,0x021bdb31,0x00c036fa }, + { 0x01fbd009,0x0d7045d6,0x1456058a,0x1163200d,0x00d8f0b6,0x193bcdcf, + 0x06530bac,0x1896da80,0x00a6b2a2 } }, + /* 32 */ + { { 0x0d3549cf,0x019f287b,0x135997b5,0x06d2dff5,0x1fcb46f3,0x1ed66708, + 0x0181a56f,0x0a55ef93,0x00810ee2 }, + { 0x1159bb2c,0x0a287f0b,0x02cd5ed9,0x1f7d7ceb,0x1ea72f7d,0x1f3a6b4f, + 0x1d14ac15,0x0f524e62,0x00d48571 } }, + /* 33 */ + { { 0x10cb5a98,0x0ba0d457,0x0c442fc4,0x151f263e,0x02adfd3d,0x1165d59c, + 0x01386653,0x14e5f34c,0x006a6045 }, + { 0x02b2411d,0x186069fd,0x03a5b805,0x1d707ca2,0x1b3ccbe0,0x0fb9c432, + 0x1e40ef32,0x1f5f3c2a,0x00d3e45c } }, + /* 34 */ + { { 0x083f7669,0x10fb4ddf,0x01df5af3,0x115d04e5,0x0278d09f,0x172a1922, + 0x06725522,0x1bdc7858,0x00207755 }, + { 0x0fef1945,0x1deb0ecb,0x0b4a30e1,0x0279df62,0x164aa188,0x08eb396f, + 0x00367ef3,0x1cae2a96,0x0048dc5e } }, + /* 35 */ + { { 0x17e5a199,0x11bc85ff,0x0732edc4,0x1f719f31,0x19c79e0e,0x15ff0528, + 0x111709e8,0x1dbbfede,0x00f2fb0a }, + { 0x10b5025f,0x0e04abaf,0x1ea7c890,0x0a87ae81,0x1fbd0550,0x04569c05, + 0x14963e8f,0x02bb651a,0x00a13e90 } }, + /* 36 */ + { { 0x02b65cbc,0x0fbd1a85,0x119089be,0x0972e454,0x107a10b0,0x1120f11f, + 0x09bc9973,0x160292ea,0x002bf0d6 }, + { 0x0b216fb7,0x1ea6e9fa,0x17689ab4,0x0f70cff7,0x0505cf7d,0x1c1fb384, + 0x027ebade,0x0b42c5fd,0x0042a94a } }, + /* 37 */ + { { 0x0aadf191,0x0235685f,0x089a35d6,0x1491204b,0x1c1f60f8,0x182824a6, + 0x18f7a180,0x0d38cbdb,0x002c2dd9 }, + { 0x13849c17,0x0810b8ec,0x0894375b,0x0911743b,0x05485460,0x03831e1d, + 0x16f12043,0x03e858ad,0x00f437fa } }, + /* 38 */ + { { 0x0a0f7dab,0x1506b8a2,0x1dba6b1a,0x092f262e,0x197860f0,0x10287af9, + 0x0aa14b02,0x066a8e0f,0x00aaf45b }, + { 0x018d364a,0x0f1be19e,0x125c5961,0x17360c7c,0x05444d40,0x0b408af6, + 0x0af3d05c,0x01be9e4e,0x00cdf631 } }, + /* 39 */ + { { 0x0ea8b7ef,0x039e311c,0x0f08a1dd,0x126a310b,0x08e3408e,0x13b915ed, + 0x1fc90655,0x175b53c5,0x00f0d008 }, + { 0x0414d3b1,0x089338e9,0x067a9d8a,0x0a930b60,0x1cbdbb37,0x1cb6a29d, + 0x0e2d7186,0x1eb9510f,0x005bd5c2 } }, + /* 40 */ + { { 0x149a3154,0x187a34f7,0x0acba6bb,0x0b4b2adc,0x04a9c3e8,0x160f5549, + 0x1c6516ab,0x191413c8,0x00aa12df }, + { 0x0df69f1d,0x1793913a,0x1fd79cc9,0x09905945,0x1dd44e0e,0x0739dbd4, + 0x0406e763,0x0e7c9195,0x006c036e } }, + /* 41 */ + { { 0x0f6e3138,0x07d70950,0x0b4d1697,0x0dde004b,0x12bc5696,0x0325a2b3, + 0x1892264f,0x0b12d5f7,0x00292ff6 }, + { 0x1e213402,0x09286a22,0x04b27fb5,0x101c4e87,0x072e8f65,0x1cbfed0e, + 0x09d825ec,0x1206236e,0x00644e0c } }, + /* 42 */ + { { 0x047153f0,0x0f210f0d,0x01063278,0x1876f324,0x17672b86,0x0743b82e, + 0x09de4ef7,0x127956f3,0x00f25ae7 }, + { 0x0d869d0c,0x198ca51b,0x01b09907,0x0b910493,0x0945e9d5,0x0f5184b7, + 0x08f927ed,0x0a627b61,0x0039b8e6 } }, + /* 43 */ + { { 
0x16fd2e59,0x1baa1005,0x157263cd,0x0580cd24,0x0573935e,0x190d0715, + 0x0c1b676a,0x05e1e33b,0x0039122f }, + { 0x03cad53c,0x1de70f00,0x1705f8f3,0x16581fcc,0x13877225,0x18e94d50, + 0x1e35caeb,0x1f19d01f,0x008de80a } }, + /* 44 */ + { { 0x007bbb76,0x1df546c9,0x1e09d62b,0x18fcf842,0x036b1921,0x1ba58e02, + 0x10137e8a,0x00c5c6d1,0x00871949 }, + { 0x03993df5,0x0fc945dd,0x0cf49aad,0x1aeb6be7,0x15050639,0x13c542da, + 0x1784046a,0x0d4b6e9f,0x00fc315e } }, + /* 45 */ + { { 0x08d6ecfa,0x10fea0d7,0x1b1fe195,0x1889ec35,0x0741d5f8,0x153da492, + 0x02226114,0x15bdc712,0x00e6d4a7 }, + { 0x0593c75d,0x02a9768a,0x09c45898,0x0e1b49ba,0x0c7db70a,0x0f49bdd1, + 0x195f4abb,0x13537c55,0x0035dfaf } }, + /* 46 */ + { { 0x0a736636,0x1cab7e6d,0x0b2adf9a,0x0a3b2f5c,0x0996609f,0x1fa0879a, + 0x14afec42,0x1ae39061,0x001da5c7 }, + { 0x1cce6825,0x020f2419,0x15cf0ed7,0x1a231ff2,0x036b815a,0x0963f918, + 0x075a8a15,0x1fbb7e97,0x007077c0 } }, + /* 47 */ + { { 0x06b9661c,0x1b1ffc6a,0x0b3f5c6f,0x1fa6d61a,0x1f8f7a1d,0x10a05423, + 0x19100dcf,0x05dca1df,0x0053a863 }, + { 0x096d8051,0x0bb7fb43,0x13d1a282,0x18192b8e,0x026bddae,0x06e1af27, + 0x13058a65,0x0da69c3f,0x00028ca7 } }, + /* 48 */ + { { 0x1c9877ee,0x08ea3ee7,0x074000b4,0x06c42100,0x060b6c8b,0x008baa61, + 0x011b400b,0x1b0d2c5e,0x0004c17c }, + { 0x10daddf5,0x0cde84a5,0x1395701b,0x046aea49,0x003b5bea,0x0b73396d, + 0x11d198cd,0x1d3fdb2e,0x00f7ba4d } }, + /* 49 */ + { { 0x0be1263f,0x06dfd1a7,0x0b9f39b4,0x0c6e6ae3,0x0f523557,0x02a9c153, + 0x11074910,0x000a4263,0x00e31f96 }, + { 0x0a6b6ec6,0x0ddc90b7,0x10bf1134,0x03a25ce7,0x0a29437a,0x1f5644e8, + 0x11ef0439,0x0b39c69a,0x00aa3a62 } }, + /* 50 */ + { { 0x16f3dcd3,0x1e7cefa9,0x0fdcd83e,0x1bdaa1a5,0x04f5b6ce,0x087d6fa8, + 0x0bb9245c,0x0c4fcf3b,0x002398dd }, + { 0x0d09569e,0x1a382d1b,0x127dda73,0x0c3376a2,0x0034cea0,0x01bb9afb, + 0x0843fe70,0x1643808c,0x005717f5 } }, + /* 51 */ + { { 0x01dd895e,0x1f114e49,0x10a11467,0x030a0081,0x17ecd8e5,0x091c8eb1, + 0x037be84f,0x0ac1c785,0x00660a2c }, + { 0x167fcbd0,0x06544576,0x0a7c25a7,0x0e48f01d,0x12b4dc84,0x1a40b974, + 0x114ccacb,0x0989ea44,0x00624ee5 } }, + /* 52 */ + { { 0x1897eccc,0x0aa4e726,0x06202a82,0x13a3b27f,0x07c204d4,0x1211821d, + 0x0f01c8f0,0x1f7257bf,0x004f392a }, + { 0x1de44fd9,0x0b4fc7d3,0x0cc8559a,0x19f7c8af,0x0bc3cb66,0x14019b47, + 0x06736cbe,0x0ef99b67,0x008a3e79 } }, + /* 53 */ + { { 0x06c4b125,0x0f0c40f8,0x18f2a337,0x09c601ed,0x013e9ae3,0x0cef2e3d, + 0x1013bda6,0x046e1848,0x003888d0 }, + { 0x04f91081,0x11401ab2,0x0055411d,0x1f9ec2be,0x0d36e3d9,0x16e43196, + 0x0cd8609f,0x08e30204,0x00a5e62e } }, + /* 54 */ + { { 0x0facd6c8,0x1412f719,0x0f2f1986,0x18c6a8a9,0x19931699,0x16fbcc6f, + 0x0b70338f,0x1cc8cd4b,0x002c4768 }, + { 0x10a64bc9,0x1a37fc64,0x1de7d72c,0x14c041c8,0x1e884630,0x08325e02, + 0x0a836527,0x083f3cca,0x007b5e64 } }, + /* 55 */ + { { 0x1d28444a,0x0b4a1160,0x04da8e48,0x0d8bb17c,0x07fcee99,0x17f2fd86, + 0x11288e1e,0x196191ae,0x00b8af73 }, + { 0x138b86fd,0x1ef41d51,0x02973fd7,0x07e2b14b,0x09433fee,0x07b79056, + 0x025727ba,0x0befe7e1,0x00a03639 } }, + /* 56 */ + { { 0x010f7770,0x039e35dd,0x0a838923,0x02db0342,0x02b9fa6f,0x1b4128de, + 0x14cc4037,0x0030ebf6,0x004be36b }, + { 0x1fb56dbb,0x11304374,0x19e93e24,0x1fdf160f,0x12f20306,0x0602b36a, + 0x0303bab3,0x10e37b80,0x008cbc9a } }, + /* 57 */ + { { 0x00dac4ab,0x098c4ae6,0x0bfc44b8,0x094880e2,0x0ee57a87,0x173e350e, + 0x17e18cca,0x07c18106,0x0044e755 }, + { 0x1734002d,0x0a81fffb,0x0d10971b,0x0b971616,0x138b59d3,0x013b0743, + 0x106257dc,0x074bd71f,0x00470a68 } }, + /* 58 */ + { { 
0x10513482,0x0dbb0ee4,0x1a49daa0,0x0e405403,0x13083028,0x00f70673, + 0x1bbf3691,0x1218c7b8,0x00164106 }, + { 0x0d06a2ed,0x081a5033,0x06c402fd,0x1aee8a31,0x018c9dd4,0x173955c1, + 0x0d3f6452,0x1faf5797,0x00d73479 } }, + /* 59 */ + { { 0x1ad4c6e5,0x16f7d8b2,0x01b4135f,0x19e11eb6,0x1cb14262,0x0dd8c2ba, + 0x19ac4bb5,0x1c60ee2c,0x00816469 }, + { 0x161e291e,0x1d5cebca,0x17859875,0x1b5e4583,0x00513eb9,0x13f589af, + 0x1e73d260,0x047e1ba7,0x000a36dd } }, + /* 60 */ + { { 0x01d5533c,0x0c69963a,0x0118a3c2,0x1eb53d0d,0x1bd117c5,0x1456f1a4, + 0x0460e688,0x1adfb756,0x00e331df }, + { 0x0bcc6ed8,0x08055b43,0x1e898394,0x01877bde,0x050d7716,0x0cd3de74, + 0x0e26418f,0x054925c6,0x00d3b478 } }, + /* 61 */ + { { 0x13821f90,0x0a4db747,0x1adeab68,0x1bb3dacd,0x1311692e,0x14a98d00, + 0x16f42ed9,0x0b4990d4,0x00728127 }, + { 0x13ff47e5,0x01c2c7be,0x00591054,0x0c2d78c2,0x19bb15e1,0x188d3efe, + 0x01658ac3,0x0fd9c28a,0x002c062e } }, + /* 62 */ + { { 0x0159ac2e,0x1b7ccb78,0x16c9c4e9,0x1cee6d97,0x06047281,0x09440472, + 0x1bc4ab5b,0x1f2589cf,0x00282a35 }, + { 0x00ce5cd2,0x01aa58f6,0x1e708a67,0x13df9226,0x0c11ecf9,0x179c1f41, + 0x0af664b2,0x026aa9a5,0x00c71cd5 } }, + /* 63 */ + { { 0x09b578f4,0x042ef4e0,0x0bfe9e92,0x09c4b1c7,0x02f1f188,0x18dbac8c, + 0x0e8e3dda,0x0819e8fe,0x00c50f67 }, + { 0x174b68ea,0x0e256f99,0x0597f8aa,0x0de646d3,0x13050a40,0x111142d2, + 0x0370be1a,0x14e4252b,0x00b9ecb3 } }, + /* 64 */ + { { 0x14f8b16a,0x17c20877,0x1ec99a95,0x0835fd88,0x087c1972,0x15c736ce, + 0x0c6c2901,0x0059a855,0x00803f3e }, + { 0x04dbec69,0x18184d40,0x0eb417df,0x170bee77,0x0197fa83,0x1939d6c7, + 0x17071825,0x01ca0cf5,0x00c09744 } }, + /* 65 */ + { { 0x0379ab34,0x0352b796,0x077e3461,0x1c0d1708,0x068efa8e,0x022c8bb6, + 0x1cc080c5,0x1ab22be3,0x00f1af32 }, + { 0x1d75bd50,0x0e1ba98a,0x0bd9ef26,0x19ff75ee,0x1723f837,0x120c246b, + 0x122c184e,0x061c5a83,0x0023d0f1 } }, + /* 66 */ + { { 0x141500d9,0x0bd5b76f,0x0fab6a21,0x1215cbf9,0x059510d8,0x032444b9, + 0x0b754bfa,0x1ad8147f,0x00b0288d }, + { 0x050bcb08,0x09907983,0x175b85a1,0x1ec626d2,0x1aa7671a,0x1053dcc4, + 0x0348c7d4,0x09fe8119,0x00ffd372 } }, + /* 67 */ + { { 0x1458e6cb,0x1cb47325,0x1e974a14,0x1b5a4062,0x15f56992,0x1705bd53, + 0x1b7ce052,0x095af184,0x00f5590f }, + { 0x0f0ba55a,0x1e125e9e,0x1de2eb83,0x08e49418,0x1674a0fc,0x0327b41d, + 0x088073a6,0x0a9edee9,0x0018d6da } }, + /* 68 */ + { { 0x15be5a2b,0x0c9f112e,0x0d3cf1bb,0x0f3306b2,0x06ffc6fe,0x04931131, + 0x05a90c50,0x1b2f3204,0x0050bbb4 }, + { 0x057ec63e,0x1c0c8e37,0x07736c8d,0x04588030,0x0e0f6654,0x04cd811b, + 0x070d06a0,0x03003fc9,0x002b1001 } }, + /* 69 */ + { { 0x1b391593,0x0345ae2c,0x009c3f3f,0x0beb44b3,0x0dcbbc38,0x19d568cd, + 0x1831c513,0x13307f75,0x00dd5589 }, + { 0x14b82ff4,0x1dc45c73,0x19cd3264,0x007880e3,0x0322ad2e,0x0f57a1e0, + 0x010669ea,0x0a2293ac,0x00e6e4c5 } }, + /* 70 */ + { { 0x1e9af288,0x0fb2add8,0x0b6a4c55,0x1c34c9ef,0x020e5647,0x1f25e594, + 0x1bfd0da5,0x1620fdaa,0x0051e00d }, + { 0x171c327e,0x1e8b4dc3,0x05b0ab50,0x1b641695,0x1477929c,0x08fa9ef5, + 0x05df01f5,0x08293052,0x00e22f42 } }, + /* 71 */ + { { 0x035f1abb,0x0a2f47a3,0x14e21d33,0x18196ad0,0x0034d7ed,0x160fdad4, + 0x0327251c,0x07aa5b89,0x00f70937 }, + { 0x08af30d6,0x00cb35dd,0x0deda710,0x1ebe95e2,0x1c47e95b,0x0b1549b0, + 0x0c44e598,0x111ce4eb,0x00bd52d2 } }, + /* 72 */ + { { 0x1c5fa877,0x18aae3d4,0x0e8f522a,0x15ace4fa,0x189d817d,0x1fcf39e8, + 0x1e990fd0,0x1c99154e,0x00a0d0f8 }, + { 0x0c94f92d,0x1df57ec6,0x1376ce82,0x11917c18,0x0ba14d81,0x12fc5c17, + 0x08008b31,0x18f28dad,0x00a56c78 } }, + /* 73 */ + { { 
0x0dd09529,0x0b11c8d8,0x0b77f3ca,0x1c1d4c7b,0x1f481803,0x1a8fadad, + 0x19e8b1dc,0x1f0e6346,0x00d8befd }, + { 0x1c0157f4,0x1c8cea17,0x1239942a,0x195daffd,0x08b0af51,0x05a0016a, + 0x11e337e7,0x14b9d3ec,0x00854a68 } }, + /* 74 */ + { { 0x03506ea5,0x01afb3db,0x1f8359b7,0x0d891349,0x1cd4d928,0x0e9dff4a, + 0x0a54fc40,0x0173108d,0x005cacea }, + { 0x1ceac44d,0x086fb064,0x13470eaa,0x0535e86a,0x1babe3db,0x1ef456ae, + 0x1ea42374,0x0246bc9d,0x00e4982d } }, + /* 75 */ + { { 0x034cd55e,0x18825116,0x00344c88,0x12b7664d,0x1d943586,0x0d7d0fd0, + 0x1267ecd1,0x1ec2d640,0x008046b7 }, + { 0x18e7d098,0x099ac0f1,0x1bc2dc2d,0x0c3d1be8,0x178c4d7f,0x14f52265, + 0x1d54c37a,0x0f721055,0x00eb17ca } }, + /* 76 */ + { { 0x16a145b9,0x1a8dacc3,0x0f1c7b05,0x1ed61f83,0x115bba5c,0x1ab29c93, + 0x04c74f80,0x175f56bc,0x00097b00 }, + { 0x165f69e1,0x1336474a,0x0f94666a,0x11eeb56b,0x1d98477e,0x1d08ed27, + 0x127980ce,0x0f75fb79,0x00f95c74 } }, + /* 77 */ + { { 0x1ebae45e,0x0c780e9d,0x0f1a5555,0x17d3e189,0x04fc6a8e,0x02d8ede3, + 0x00debadc,0x03cacddb,0x00351260 }, + { 0x1a1161cd,0x19b78f0f,0x197be1e4,0x1571aa98,0x121e5328,0x17713927, + 0x0dad1d5f,0x046c0d15,0x000ef971 } }, + /* 78 */ + { { 0x14ca4226,0x12cc67ba,0x190b2380,0x1bc271f0,0x017905ee,0x1fba2347, + 0x12552258,0x066769f7,0x00fc16d9 }, + { 0x07c800ca,0x14b7d98f,0x1e2b6aaf,0x00c6624c,0x1e8b5138,0x024bb7f9, + 0x085cf589,0x1e372baf,0x0014ca4a } }, + /* 79 */ + { { 0x1d2f81d5,0x123b8dd5,0x1df4659e,0x1f3ad203,0x1c9071a5,0x1f7be56c, + 0x0c776262,0x0c7eb384,0x004057b0 }, + { 0x09c05c0a,0x1fec17f4,0x1037e16f,0x0238de3b,0x016dbe49,0x065751ad, + 0x0c4cefbf,0x0c9e2661,0x001c3b5d } }, + /* 80 */ + { { 0x00ec21fe,0x1f0a5ff4,0x156fa097,0x1c22d584,0x05d67f6c,0x0d0397a5, + 0x0ebe62f1,0x091b6fcc,0x00fad271 }, + { 0x09ab05b3,0x0605b561,0x0946b9a4,0x1350789c,0x0de7d37a,0x043ae155, + 0x0a1029f7,0x1c73e1c3,0x0077387d } }, + /* 81 */ + { { 0x056c0dd7,0x14f6624d,0x021b1d07,0x1ff9b08c,0x1aecea5c,0x0a047a82, + 0x11fa3de8,0x1817de18,0x00b37b85 }, + { 0x0c0e6a8f,0x0cb5b726,0x0e23c8cd,0x1a977ed6,0x0ef4efd6,0x09fd61ce, + 0x0356ae91,0x191f3ec5,0x009c135a } }, + /* 82 */ + { { 0x04e35743,0x15519014,0x08f37bcc,0x1ad5630b,0x19819320,0x18bb0ef8, + 0x147ee086,0x03f88670,0x00572136 }, + { 0x11fc9168,0x186d9b53,0x17100f07,0x1174e6bc,0x0d8f55f9,0x143f1bde, + 0x06f7d932,0x193cd762,0x00dcbac3 } }, + /* 83 */ + { { 0x0518cbe2,0x00eccb42,0x07ac13bc,0x05f83139,0x1eebfd24,0x11e3f23f, + 0x0189c9d9,0x13c5ac4d,0x00b8c1c8 }, + { 0x08e1d569,0x0d2c5eee,0x16233414,0x1013916f,0x131eb563,0x1fecf88f, + 0x0b509b09,0x1b45f284,0x005d23bb } }, + /* 84 */ + { { 0x15c8f8be,0x10e394a4,0x1cd8afc2,0x03890077,0x1d4ac296,0x0201efb1, + 0x04027906,0x19723d9d,0x00c109f9 }, + { 0x18945705,0x1684ae82,0x1ae17030,0x107b2dbb,0x0449bb90,0x15c6bd20, + 0x1b8611a4,0x09e5ddc3,0x009bc334 } }, + /* 85 */ + { { 0x02913074,0x0ad71ab2,0x0950ac43,0x12364e91,0x0732a554,0x1332d988, + 0x13051a72,0x0a4be349,0x0029591d }, + { 0x184f983f,0x1b7adb5d,0x17e13879,0x1dde833e,0x0a189be7,0x0a4b405d, + 0x0cb04803,0x03e31de6,0x00637655 } }, + /* 86 */ + { { 0x162976cc,0x0d2f8a72,0x1c4b0e2f,0x1947cc1d,0x0985222b,0x18323665, + 0x01eaefe8,0x19011c53,0x00bdb79d }, + { 0x0b06a772,0x0965ae4e,0x14db73bf,0x08eb55fc,0x15db838f,0x10113e15, + 0x052b0a8f,0x0035ba78,0x008ee860 } }, + /* 87 */ + { { 0x04ade873,0x1f4b4c0d,0x1ee92332,0x13549b89,0x14ba57ee,0x144cad02, + 0x092cb3b8,0x0f4deef5,0x0092e51d }, + { 0x1190a34d,0x045d7d43,0x0f47b465,0x11eeb7ed,0x11144d69,0x13718657, + 0x0aab403b,0x0de14ad5,0x005182f8 } }, + /* 88 */ + { { 
0x1a4cc99c,0x1d310963,0x1b67287e,0x0136d07c,0x18c5aff6,0x13e5ad64, + 0x1bc976ec,0x0ba80e74,0x0091dcab }, + { 0x1f575a70,0x0db661ea,0x0361fe80,0x06c272df,0x017360cb,0x074644cc, + 0x1cac5975,0x1b72f2e9,0x0017a0ce } }, + /* 89 */ + { { 0x076c8d3a,0x0430f150,0x03e492ce,0x155a7242,0x035d9701,0x157209d4, + 0x1d065343,0x0d8fe99b,0x002e8ce3 }, + { 0x037a862b,0x0939ed58,0x19323ea4,0x15376ec1,0x0f2dd01b,0x09c419dd, + 0x03cfe591,0x19669ecd,0x00f4ccc6 } }, + /* 90 */ + { { 0x11f79687,0x077a92e7,0x1bea0551,0x12a92b25,0x18d297c5,0x0ba0d2e3, + 0x0f27848c,0x111341be,0x00ac0db4 }, + { 0x1f01747f,0x15fe388e,0x05f7c4e1,0x1726b1de,0x16bb5592,0x0727ae65, + 0x128b9620,0x0c32992e,0x0095a64a } }, + /* 91 */ + { { 0x015a4c93,0x160f7ed6,0x1614505c,0x0d36e704,0x10bad402,0x1d8e0b65, + 0x19ddaa37,0x17452420,0x00231e54 }, + { 0x0ae6d2dc,0x186fc8bc,0x044a4629,0x154c7e72,0x172234d6,0x1935af2d, + 0x0787d89d,0x065b14e6,0x00ab0be0 } }, + /* 92 */ + { { 0x0d131f2d,0x0bd6874c,0x013c4042,0x1e13c676,0x1a748637,0x10cb6af4, + 0x19e46b21,0x10059ed4,0x00f1bcc8 }, + { 0x08daacb4,0x0e348a07,0x1d940249,0x1c80aac1,0x137a63c4,0x047e23bc, + 0x09c56473,0x0d2b5d76,0x00851694 } }, + /* 93 */ + { { 0x11dcf593,0x11ae0a1f,0x062f8ef7,0x00565360,0x19d3d782,0x16e14dee, + 0x1763a736,0x1a5b55aa,0x008f67d9 }, + { 0x1481ea5f,0x0088b2b3,0x13164321,0x05bbd3c6,0x13fa8e7d,0x01fa0282, + 0x0d77ff75,0x17380e51,0x00f84572 } }, + /* 94 */ + { { 0x17af71c9,0x10d3d38c,0x1cd95957,0x092888f4,0x15063a14,0x1703870e, + 0x106686d2,0x020c2d65,0x00edee27 }, + { 0x11734121,0x1781a7a8,0x097a7c2c,0x18dcaa94,0x02ecf1ca,0x0479d206, + 0x1fd23705,0x13689d7a,0x009fd27e } }, + /* 95 */ + { { 0x16e2cb16,0x063b2c57,0x16466d8f,0x16fa59fc,0x15583e3e,0x0c0b0b46, + 0x0e1d6a31,0x16d2b1fe,0x00a40c2f }, + { 0x1edcc158,0x04f62b07,0x1c8c15a3,0x10098cab,0x07e127ad,0x13824d18, + 0x1b3f64e5,0x170fb8db,0x0099bc9b } }, + /* 96 */ + { { 0x127dafc6,0x054a90ec,0x02734661,0x03f6d2b8,0x06dde52c,0x00d07c9b, + 0x19927656,0x01742daf,0x009abe21 }, + { 0x08915220,0x0057c252,0x1605b192,0x062ed49b,0x1ca5afa7,0x1cc38b40, + 0x12c31f54,0x0af0fe68,0x007881c2 } }, + /* 97 */ + { { 0x00bcf3ff,0x19ccda8f,0x1fdd3da4,0x05978a24,0x1d9680d0,0x12d16e80, + 0x05023ed1,0x033461d1,0x0015e6e3 }, + { 0x1e0e05f4,0x036b7069,0x16210119,0x0f7bb886,0x050d3fad,0x03e8e27c, + 0x0b3af987,0x19e3222e,0x000e55fa } }, + /* 98 */ + { { 0x18787564,0x14ecc037,0x1a17399f,0x062e4263,0x1e8d61a3,0x0c655c0c, + 0x15ddac05,0x0ecdfd2c,0x00d73d09 }, + { 0x1eb7206e,0x1241a128,0x062ed090,0x12521f8c,0x0a520a51,0x1c2caf18, + 0x142d772e,0x0e91e2b4,0x009250a3 } }, + /* 99 */ + { { 0x1e577410,0x17f847c5,0x1dea31b2,0x011406a0,0x063a4fd4,0x1944f605, + 0x102fc7d8,0x10583991,0x00774140 }, + { 0x0b0991cd,0x0d207d37,0x1f70a581,0x1410cc93,0x0fd40c1c,0x11e3d992, + 0x02e4e9a2,0x09a25d64,0x008cb04f } }, + /* 100 */ + { { 0x0906171c,0x0e1682ab,0x09030fec,0x07d39b60,0x06841907,0x15a7ec48, + 0x0d476e39,0x1de8e247,0x00e4e429 }, + { 0x18ec36f4,0x1c6ea9e1,0x12da89c2,0x05b803fe,0x09a48f9d,0x1703c3cd, + 0x15497419,0x1fe78dcc,0x0037bca2 } }, + /* 101 */ + { { 0x1f562470,0x06971e3e,0x0592b253,0x04e54581,0x193be44f,0x0efcc063, + 0x08a9f1b5,0x1b860056,0x0059913e }, + { 0x1750592a,0x109cd41a,0x00f7809e,0x003b01cf,0x1d64f99e,0x01baf502, + 0x089b3e30,0x0956027c,0x0043786e } }, + /* 102 */ + { { 0x1e56b5a6,0x1995876c,0x1f1a3e7f,0x01b34db3,0x046a7075,0x1422acbc, + 0x19ebb057,0x1316fcf3,0x008638ca }, + { 0x0afc24b2,0x1ad704b0,0x0b3a3c8b,0x131d5e9b,0x1a78f053,0x0ee85765, + 0x1bc0edd9,0x0d4f6754,0x001ecdd3 } }, + /* 103 */ + { { 
0x0c5ff2f3,0x09d66b13,0x1cea5e17,0x0a2d8050,0x10d54a2d,0x04fd6908, + 0x0cb6b653,0x10ba8b3e,0x00d85d0f }, + { 0x10b11da3,0x1b805c68,0x00c63127,0x0458614f,0x0decdd2c,0x047a4904, + 0x118955a6,0x18769da7,0x00a04f19 } }, + /* 104 */ + { { 0x0d7f93bd,0x03c92647,0x0bd47d82,0x0958ba72,0x171afcb6,0x1985410d, + 0x02c1f2b8,0x1d4b812a,0x0092b2ee }, + { 0x05b6e235,0x0d6264a4,0x0db03c21,0x19495252,0x08891ab2,0x1359f028, + 0x1db203ea,0x042b0684,0x001ee782 } }, + /* 105 */ + { { 0x063e79f7,0x10517007,0x067641a9,0x01cf65e7,0x1c09df59,0x02a53303, + 0x05424084,0x1b0af4dc,0x00f3f2ce }, + { 0x110d9b55,0x0028879f,0x19099208,0x1f9f59b0,0x10e7c9d2,0x0d53f45e, + 0x0843958c,0x0a87b47c,0x000f56a4 } }, + /* 106 */ + { { 0x1043e0df,0x190dffd0,0x001f9b56,0x096d9938,0x0517a6c7,0x17606a54, + 0x098c6995,0x08232d3c,0x00bd8f17 }, + { 0x1eb7494a,0x14dddc35,0x1cee0e22,0x0fa8de8b,0x1a79a156,0x0953d272, + 0x08277de8,0x06a6199f,0x002d1a1c } }, + /* 107 */ + { { 0x106508da,0x0971c09a,0x15e569c6,0x03018943,0x144b3336,0x0ca4bd4c, + 0x091b376d,0x0bd723f7,0x00a107a6 }, + { 0x0f94d639,0x168e8e28,0x162df5f9,0x15e6eb14,0x1ca1c8b4,0x0ac25e9b, + 0x0bc869f1,0x015f0f53,0x00183d76 } }, + /* 108 */ + { { 0x0dde59a4,0x0eb4b888,0x02fbe1ca,0x1b1a0e1d,0x0be78f1a,0x04b1a797, + 0x1d508a6d,0x13b84d3a,0x001d4417 }, + { 0x0390d30e,0x196e067c,0x1a04432c,0x164ea61b,0x0339a0a3,0x0ee295e0, + 0x0988c6bc,0x1852c0da,0x00771f9c } }, + /* 109 */ + { { 0x05040739,0x0cc9f3bc,0x09aa4e66,0x073b7300,0x0fc26445,0x1b797afc, + 0x063b3d03,0x06206c4e,0x0064427a }, + { 0x05428aa8,0x1a796c3c,0x1ed26a13,0x15b87fd7,0x101ac7b7,0x1636f91e, + 0x15b4806c,0x092d5d21,0x0049d9b7 } }, + /* 110 */ + { { 0x035d1099,0x03c6c5e2,0x03468233,0x179a9d1d,0x08a412ad,0x1150165b, + 0x11140b0b,0x0367ec0a,0x009037d8 }, + { 0x074c7b61,0x06dd6138,0x0ff5cb9f,0x006356af,0x15352fe2,0x164b2cb6, + 0x0e718733,0x0d4f980c,0x0008c3de } }, + /* 111 */ + { { 0x16d552ab,0x07ee8107,0x13607c48,0x15ff300b,0x1129156b,0x1e1f489a, + 0x0cbc1bed,0x0848af2d,0x00c69094 }, + { 0x01231bd1,0x1d9d74e2,0x11608145,0x18dd0eb9,0x0a1221ea,0x1bd5fceb, + 0x0b008220,0x00595fc7,0x003fa3db } }, + /* 112 */ + { { 0x05058880,0x1ad1f328,0x0e50fcb5,0x06cbdec8,0x049257da,0x030e7d59, + 0x03fd051e,0x161fb701,0x00c5c4bd }, + { 0x1272b56b,0x1a89f1a5,0x0e410e9c,0x04fd2a23,0x04969c83,0x11befc42, + 0x1ad7f633,0x1288d856,0x002d56db } }, + /* 113 */ + { { 0x1f46ac6b,0x030bc17f,0x08b90949,0x1ef24c0f,0x08de1d19,0x11e204d2, + 0x090bebfa,0x13bca077,0x000f56bd }, + { 0x145cda49,0x1bea7689,0x1bca6744,0x02b1f902,0x03402821,0x12a5575a, + 0x17c79f1a,0x13a22e76,0x004003bb } }, + /* 114 */ + { { 0x00803387,0x1c740c4d,0x12f5010e,0x022bea73,0x17f21ece,0x1046e943, + 0x1e790a5c,0x04540fe5,0x00537655 }, + { 0x08a4182d,0x04c0510d,0x0677de69,0x17a0f464,0x1a2d4a2b,0x05170d0c, + 0x15259d34,0x0b0d8ba8,0x007a056f } }, + /* 115 */ + { { 0x1d8a2a47,0x03592ac4,0x17c9dcd9,0x10529187,0x0d5395b5,0x000755f8, + 0x19d547b0,0x1e2f4344,0x0077d482 }, + { 0x07853948,0x050decac,0x1efffbae,0x102f7ad9,0x01e47a6f,0x002bc034, + 0x0392adbb,0x05656716,0x00411501 } }, + /* 116 */ + { { 0x0de28ced,0x039f87a3,0x04fb11cf,0x1b4ec136,0x063921d5,0x074f372e, + 0x051986e3,0x0e5f7d41,0x00cdf045 }, + { 0x0c53c3b0,0x059e2c5b,0x1ee10f07,0x1c782088,0x1780e97f,0x0570965c, + 0x0427ecae,0x1b52e706,0x00ee703d } }, + /* 117 */ + { { 0x1f57e43a,0x028a8a07,0x0e046e0d,0x0cc1a763,0x0b986d44,0x0effc7a1, + 0x1884aced,0x13b42c59,0x002a0ad8 }, + { 0x0bc277ba,0x072534a3,0x10709d99,0x1192a982,0x16274c78,0x1326655f, + 0x1964506a,0x0cf58568,0x00d62d0b } }, + /* 118 */ + { { 
0x0c054ac4,0x0e2ec3d9,0x1f7de20e,0x00b0b3e4,0x128d6570,0x05f9d8c0, + 0x109bb7df,0x1e532384,0x00b39a23 }, + { 0x10b16ae5,0x094250af,0x0dbd46e5,0x140b6342,0x007830c6,0x009bf938, + 0x1314758f,0x12580ce9,0x0004ed00 } }, + /* 119 */ + { { 0x1ae90393,0x1a0c2e8c,0x0f593987,0x0f685294,0x0fc14304,0x00d34c2a, + 0x0e1eb800,0x18202ef8,0x00a0a91f }, + { 0x0e2c831e,0x1851f80d,0x1c9f85bf,0x0d5d0456,0x075b4bb7,0x0450ad18, + 0x11063c4b,0x1113da41,0x00084cf9 } }, + /* 120 */ + { { 0x1ca6becf,0x0c284ef7,0x1fecca36,0x1d5d00fb,0x0e8b92fc,0x0ae223bc, + 0x1df97628,0x164e757e,0x00d57955 }, + { 0x11b5d4f1,0x086d3cf1,0x1e9e8708,0x05e09679,0x1c20baa5,0x1044ee13, + 0x07c75344,0x08405a28,0x008e14ea } }, + /* 121 */ + { { 0x12897042,0x16a81a2f,0x100b12bb,0x0a663e86,0x1fb218d0,0x00ca645e, + 0x05632367,0x06e5549a,0x00597e1a }, + { 0x0f0bd68c,0x193f60d6,0x00925140,0x17c1b956,0x03e846d4,0x06bd64ff, + 0x17a96e72,0x06c33369,0x00ca3f02 } }, + /* 122 */ + { { 0x0170bd20,0x095085ab,0x0fd779d6,0x112fe2da,0x0ade20ea,0x1ff8a259, + 0x1f928cd8,0x0fc61380,0x00bde7fd }, + { 0x18f5432c,0x0b5db695,0x10d112d4,0x1b8397c0,0x15b5a210,0x0f37fc7c, + 0x0660f6c0,0x01c14fba,0x00b623ad } }, + /* 123 */ + { { 0x00c7b65b,0x1adeb3ab,0x0928a269,0x18ab2047,0x06795ab8,0x07e86bd9, + 0x0defe088,0x08cb1d82,0x00d6aa2e }, + { 0x1138bb85,0x055e005a,0x0cea5704,0x03a243b0,0x0a32e8c3,0x18058b81, + 0x04eac93f,0x1c05b98a,0x00111662 } }, + /* 124 */ + { { 0x0fb42b87,0x008a00af,0x1b137fde,0x1ebae036,0x1c129bd9,0x066bd3eb, + 0x03e19bb3,0x197296ea,0x00db3ee1 }, + { 0x134837cf,0x1379ed87,0x15e353ec,0x1da31772,0x0657de7e,0x0fc9be2b, + 0x096574b3,0x084a440d,0x00886a64 } }, + /* 125 */ + { { 0x05b569ea,0x011a67db,0x0846704f,0x022283ee,0x0619e200,0x042ed0ad, + 0x1ef22eb7,0x1d603142,0x00a70cf4 }, + { 0x0c4a6a65,0x127cbd74,0x0d0de3c8,0x0b9e4e02,0x0096036e,0x104f27bf, + 0x0ddef8e9,0x157a2e8f,0x00aa4772 } }, + /* 126 */ + { { 0x1aa60cc0,0x1b3b098b,0x1a0457d9,0x02c6c206,0x1bb5ac79,0x05da5de0, + 0x05d37b66,0x1b861f5f,0x00611a6d }, + { 0x015ee47a,0x073c65e6,0x0365a94c,0x12c5049c,0x1ed882e8,0x0d6f9eec, + 0x1220dbcd,0x1f02c853,0x005cfffa } }, + /* 127 */ + { { 0x1b7a99cd,0x06aa67fc,0x0f116870,0x07733b08,0x139e17bf,0x0847b163, + 0x05300e2a,0x046fb833,0x006e5a6b }, + { 0x0ba5db77,0x1c5a2a70,0x1d8358fb,0x1100ff59,0x08378b7b,0x00633b30, + 0x0f339647,0x11a485b5,0x00481a23 } }, + /* 128 */ + { { 0x15d0b34a,0x1a0bde01,0x09f029f8,0x1670d706,0x162d1440,0x1316d601, + 0x050e3edc,0x099c19bf,0x002c4111 }, + { 0x0d95a0b1,0x1d2e778d,0x1550d88a,0x166f50cf,0x086c9c09,0x06e900f2, + 0x0a5c9b5b,0x17e85ff2,0x0020477a } }, + /* 129 */ + { { 0x18d65dbf,0x1ba8b9e0,0x07b6b60b,0x1f281c67,0x1001c77b,0x0935ee78, + 0x1ad9c08b,0x1358ee72,0x00ac6640 }, + { 0x06261cc3,0x185d9b7e,0x039fa422,0x1ef79232,0x06c10213,0x075d522f, + 0x1e159507,0x0eb98245,0x00ce8e69 } }, + /* 130 */ + { { 0x1c0a67d2,0x1890da0d,0x13492283,0x08ec1488,0x1473762d,0x078eb2cd, + 0x12a03811,0x0ca4a176,0x0008fde3 }, + { 0x048bf287,0x07761ed4,0x0da75bab,0x0c4305a6,0x09482c2a,0x0fee4922, + 0x135cd60b,0x1a4acbad,0x002f7e2f } }, + /* 131 */ + { { 0x03770fa7,0x125c96de,0x0410fe6b,0x1d1ab86f,0x01171095,0x074e8bbb, + 0x0ab953cd,0x05d20ee0,0x00c65be9 }, + { 0x16fd0a40,0x1ac5181f,0x139e12c9,0x1045c779,0x167bfe7d,0x1ac2a7cb, + 0x0ce9eb93,0x08fa2327,0x004bff8e } }, + /* 132 */ + { { 0x00ff1480,0x0a0e90f8,0x1536c5b3,0x11f6fa0e,0x0f3ea2ab,0x0977ddf0, + 0x19f6b207,0x1ccaee52,0x003e4e4a }, + { 0x1c5303e6,0x10c79b69,0x0988e5df,0x13329724,0x0c3c03bd,0x07130992, + 0x00a27b5c,0x1fab1d8c,0x005388ae } }, + /* 133 */ + { { 
0x1e5d7713,0x0898bf5a,0x179276ab,0x130bdceb,0x1b26109b,0x1e27e3a7, + 0x1838cbd6,0x1a29eeb7,0x005cf908 }, + { 0x0e657b12,0x1021a884,0x1bb6799d,0x08434b72,0x0ccc2bfd,0x1a8fc4b8, + 0x138838a7,0x080c1e01,0x00a698ba } }, + /* 134 */ + { { 0x0f748fec,0x1ed8b437,0x074b3e5c,0x0eab44fd,0x05effe6e,0x12a26713, + 0x16358c2d,0x114f5d75,0x00b142ef }, + { 0x17d5770a,0x098d7cf8,0x0cd04beb,0x1e76ce59,0x159de66a,0x068def99, + 0x01d5af58,0x12cb0a2a,0x00d1896a } }, + /* 135 */ + { { 0x13c41c08,0x02cabd59,0x1a38b87b,0x1d2958a8,0x12f6c87d,0x15b9d623, + 0x08e46205,0x016f303b,0x00267b0e }, + { 0x0e62b988,0x12aa72ec,0x1b4879db,0x1b8eaa22,0x06f99d8d,0x1d781e95, + 0x0e4d1843,0x0f542232,0x00b54e28 } }, + /* 136 */ + { { 0x178a876b,0x100915a8,0x14412d02,0x1f2dfe10,0x09f7651f,0x18d58a79, + 0x1398142c,0x116bf0fa,0x0084abb2 }, + { 0x0270790a,0x0f6a1cfc,0x18fd1af5,0x196b3b0b,0x022122d6,0x0e0db60f, + 0x1901d7d5,0x0ce2ecaa,0x00e5436f } }, + /* 137 */ + { { 0x0286e8d5,0x1fc812f1,0x1114ef94,0x192b690c,0x0e3a0353,0x1adef204, + 0x067b60cb,0x116b739d,0x000404f6 }, + { 0x0781e8e5,0x1699def5,0x0f0bd6f2,0x1ea0302c,0x1caa33cd,0x14b0008c, + 0x1c055d5d,0x1be15838,0x003a4263 } }, + /* 138 */ + { { 0x1aeb596d,0x14b2f664,0x0f24ad30,0x1407ce04,0x1396101e,0x1a5b1700, + 0x0d9d1c12,0x07f20bd4,0x000ca8fd }, + { 0x151b2b61,0x1291d212,0x03f341a4,0x0f513872,0x0a63e1eb,0x095f01c9, + 0x10cf9fc7,0x0c89bb61,0x0096dca2 } }, + /* 139 */ + { { 0x187510af,0x01dda1d1,0x08da8048,0x1fd55153,0x10378846,0x0bb817ca, + 0x077348e9,0x024755ab,0x004363e2 }, + { 0x00246a47,0x121d0e3a,0x17749372,0x0571a5ca,0x1af96b36,0x03022ec7, + 0x0313e6c2,0x0b9b1773,0x00840e11 } }, + /* 140 */ + { { 0x1023e8a7,0x09102f10,0x171e82fc,0x11519bb1,0x05ddfc80,0x11390b1d, + 0x1b538a4a,0x17a61bda,0x005e0d6a }, + { 0x1cfc0f64,0x1d390e13,0x157b6201,0x1d803a1c,0x19db242e,0x1f7c8e8f, + 0x09689a9e,0x1e8528b4,0x007dea48 } }, + /* 141 */ + { { 0x05060a81,0x1efb78e7,0x1e55856a,0x1f38e5f1,0x0268be79,0x162a0356, + 0x1b473f4d,0x17dd7fa2,0x00abc2a2 }, + { 0x13e2eac7,0x16337c8e,0x174119a2,0x0174c7a5,0x0d31b6f1,0x11bb8141, + 0x1f059e43,0x128d8fdd,0x004ea353 } }, + /* 142 */ + { { 0x1266309d,0x0c517c6a,0x05168fbb,0x038d8103,0x05dc10a5,0x1a2d2bc6, + 0x1f0f3b2b,0x1123929f,0x003a76e6 }, + { 0x1d7b0d0f,0x15674523,0x161297e6,0x159d2d1e,0x17fbe963,0x06392734, + 0x1191468c,0x0148cbcc,0x008212a1 } }, + /* 143 */ + { { 0x0fab8caa,0x1be30e1e,0x0508e43b,0x171d081c,0x133ca18e,0x1fb3bf4b, + 0x05933477,0x0e2b3396,0x00aa7cab }, + { 0x1c837bd1,0x17e4939d,0x1abd75c0,0x080fa186,0x1da49c06,0x09497a11, + 0x1f0c5d88,0x0e7fc0c2,0x0040e380 } }, + /* 144 */ + { { 0x07bf9b7c,0x07c04125,0x0f8c343d,0x1a46407f,0x19ce3365,0x09904be7, + 0x149afef9,0x001660aa,0x00e36047 }, + { 0x0cc6c2c7,0x0e5cc88b,0x132fb993,0x106e1174,0x0d9ec726,0x0a1a31bd, + 0x057f737b,0x0ef47bdc,0x006542d6 } }, + /* 145 */ + { { 0x1b6c377a,0x1995b683,0x0d122f8f,0x00708f20,0x08af76cb,0x09d4106d, + 0x1c875bf7,0x1dc1376d,0x00a6534a }, + { 0x1035facf,0x050bc068,0x12d1f98c,0x0ab4673b,0x1f39335e,0x07f0e223, + 0x1c89ba94,0x05fb935d,0x00f3cb67 } }, + /* 146 */ + { { 0x1b55fd83,0x19b8cff1,0x1777443a,0x0f48d90e,0x0a784e0d,0x0fd482e7, + 0x039cceb2,0x05d55d0e,0x007cafaa }, + { 0x1d53b338,0x1c0a6820,0x01f9b1a6,0x198141df,0x12b0fe0a,0x088408b3, + 0x08bbee4f,0x183737aa,0x000aab13 } }, + /* 147 */ + { { 0x12681297,0x0e6713c6,0x02551ab7,0x0a1d636a,0x1aaf2cb3,0x18b9bb30, + 0x0ba4b710,0x00508e02,0x004b91a6 }, + { 0x12f8ddcf,0x07f884ab,0x0446bd37,0x17ec3d35,0x0430e08e,0x1b0561b9, + 0x12ad23d0,0x0a6e4643,0x0049534c } }, + /* 148 */ + { { 
0x107b7e9d,0x1efbeb8f,0x13545be0,0x11df4627,0x07ee3a47,0x1325b602, + 0x17b9e3bc,0x09facb58,0x00caf46c }, + { 0x12aa8266,0x026863bc,0x0da12ee8,0x08a8cd22,0x116b0edf,0x08b45725, + 0x1c3d5b99,0x0ae098ce,0x0014ce9e } }, + /* 149 */ + { { 0x165e8f91,0x0a22f1f4,0x03c924a6,0x19437596,0x0a0a0d3a,0x0387c864, + 0x09c74c73,0x14a7c993,0x001bb708 }, + { 0x158bdd7a,0x0e54f34a,0x0289ac75,0x140a1003,0x0f1ec734,0x1538a64e, + 0x040ac24e,0x1e5b4600,0x00f9d126 } }, + /* 150 */ + { { 0x0ff9563e,0x04de53d5,0x0645281d,0x0ef5fd69,0x11671dd0,0x0188dfaf, + 0x11a789e8,0x172e53d9,0x00807afc }, + { 0x09b08b77,0x1c5499be,0x0f1f8e1f,0x074f0a88,0x1d8ba86c,0x1d2ca3b7, + 0x163217eb,0x1a2cad19,0x00751adc } }, + /* 151 */ + { { 0x10715c0d,0x1751c5a0,0x1da5fde2,0x07d4e31e,0x1f06dd11,0x158a49fd, + 0x10fd997a,0x0d04a6ee,0x0029ec44 }, + { 0x150bebbc,0x0ca38ce5,0x1415088f,0x1dcb7fc8,0x1edb1399,0x0d9d4696, + 0x1df64335,0x1c725480,0x00ff9370 } }, + /* 152 */ + { { 0x06b75b65,0x0d16b4de,0x19947156,0x11f1aa4c,0x1d7d2418,0x199f1ef4, + 0x0068a2a7,0x1174553a,0x00977647 }, + { 0x129af2c7,0x0293116c,0x1a4248e2,0x1ebada9c,0x051e9334,0x03f2d44d, + 0x0beb39b3,0x07f585f0,0x0074a631 } }, + /* 153 */ + { { 0x175f079c,0x17a6feed,0x18dbeeec,0x00f92a31,0x136dd85b,0x1e7873e6, + 0x18f46db3,0x02a1fe90,0x00ab75be }, + { 0x173fc9b7,0x0d9b3e00,0x1653f420,0x14e841a4,0x11236b90,0x1f81e204, + 0x07d857f6,0x05c1688b,0x004ebeac } }, + /* 154 */ + { { 0x1c9f2c53,0x1b62ff3a,0x0ba5047a,0x0440231d,0x0c5d8d25,0x1b19fcad, + 0x1ff32221,0x0f658375,0x00df9988 }, + { 0x050aaecb,0x1bc77694,0x15a89cae,0x12303603,0x1bcac9d4,0x0a88d8e6, + 0x01625e37,0x14eef3e8,0x0027b040 } }, + /* 155 */ + { { 0x173b2eb2,0x0202edbf,0x06c84624,0x1f0a111c,0x0327ee0d,0x18a92cb1, + 0x0fd5406d,0x06fc99f4,0x00b393dd }, + { 0x1fd75165,0x091873d9,0x14cd5528,0x06898579,0x15022d66,0x18df07bd, + 0x1065b0db,0x025a08c6,0x0009588c } }, + /* 156 */ + { { 0x02601c3b,0x043049f8,0x170cd7f8,0x04a5f19e,0x0ff28fb0,0x194044a5, + 0x122e5573,0x153b73ec,0x0081c879 }, + { 0x06f56c51,0x007343e6,0x05d86301,0x08e2d27e,0x1353bfed,0x0520c82c, + 0x0f1113e2,0x1eabf823,0x00fa0d48 } }, + /* 157 */ + { { 0x01608e4d,0x0370e4ef,0x00a08b2f,0x1bb4226b,0x0c2d7010,0x0ee08abf, + 0x1f5bdadf,0x0ad6d46c,0x008ea0e1 }, + { 0x0383b3b4,0x1aa70179,0x007d4f28,0x0cd7287e,0x03ca5699,0x119596f0, + 0x16b13fd9,0x049f4016,0x003f5ab9 } }, + /* 158 */ + { { 0x19739efb,0x1bdd86ca,0x1afb034c,0x0361e9cf,0x067d1c75,0x16eb208d, + 0x15b8b694,0x10e56e84,0x008bc768 }, + { 0x02d3d253,0x0df1db94,0x035de7e9,0x0cf343eb,0x167bba9f,0x00b470b3, + 0x0d3e872b,0x120c1f9e,0x00b386f1 } }, + /* 159 */ + { { 0x0fedcfc2,0x0f9e09a9,0x1e2bc34c,0x0d7ec4c5,0x088c2539,0x1a7572b9, + 0x1136680a,0x1ee360d3,0x004cb460 }, + { 0x1b8095ea,0x133da69a,0x101d80eb,0x17f0b2df,0x0a16592b,0x0fb35b0a, + 0x088f851d,0x0112bdea,0x0052c0d5 } }, + /* 160 */ + { { 0x15339848,0x18e10870,0x1de32348,0x1451d0e0,0x0e170e87,0x1330b4ab, + 0x102e7477,0x07057613,0x004ac3c9 }, + { 0x0998987d,0x0df02a8b,0x027d3586,0x06ed895c,0x1933d8b2,0x1bb28d1f, + 0x17d07782,0x18fc72e0,0x00380d94 } }, + /* 161 */ + { { 0x01542e75,0x0d1aad54,0x006e6dc0,0x0e4943dc,0x1708796c,0x14bbb126, + 0x1ebdace8,0x0e3bc4c6,0x002ce3e1 }, + { 0x15d5bc1a,0x1f7f5a4f,0x1df8ad73,0x0ac0fc4e,0x1756ca65,0x1617ca89, + 0x19353faa,0x0a416c49,0x002e6cd8 } }, + /* 162 */ + { { 0x0c31c31d,0x142caa5c,0x1c86830d,0x067a00b7,0x19ec9685,0x11373ae3, + 0x15502f5d,0x08e858d3,0x00ca1775 }, + { 0x16d2dbb2,0x0376d7ff,0x12a74633,0x1b197a2e,0x178e8fd0,0x03c9d522, + 0x139a1d7a,0x02739565,0x00a976a7 } }, + /* 163 */ + { { 
0x13fb353d,0x1328f8dc,0x1f3e9c82,0x195716af,0x15281d75,0x07d398d8, + 0x0666aa23,0x02e143e9,0x008720a7 }, + { 0x093e1b90,0x01f469bb,0x1db7f0e3,0x0bb8162d,0x08742d34,0x08055a95, + 0x04f23aa3,0x0538ed31,0x009719ef } }, + /* 164 */ + { { 0x18e35909,0x10776c6a,0x177045a0,0x0db1b867,0x05026936,0x0ce83710, + 0x13075fe6,0x0edc2ae0,0x00a50729 }, + { 0x04e70b2e,0x0151bf56,0x042aa280,0x19ecaed1,0x12a5c84d,0x1f8c322d, + 0x1c9735c6,0x13bef6ee,0x0099389c } }, + /* 165 */ + { { 0x1ada7a4b,0x1c604793,0x0e24d988,0x1d3a07fa,0x1512c3ab,0x1744bb37, + 0x0b91ad9c,0x15440590,0x00a88806 }, + { 0x1380184e,0x10102256,0x1aa2e159,0x16f18824,0x04f17a8c,0x186056c2, + 0x13f9e759,0x1f68e71b,0x000043bf } }, + /* 166 */ + { { 0x16d5192e,0x0acdaee1,0x042cabe3,0x110ba68b,0x01781acf,0x168508b0, + 0x019a0d59,0x00374d89,0x0052f3ef }, + { 0x0edcb64d,0x0c339950,0x1a0de7ce,0x10584700,0x0f3090a4,0x12fd3820, + 0x19d45b2f,0x1133de4f,0x003296bd } }, + /* 167 */ + { { 0x054d81d7,0x1b55d44a,0x1ae6cf11,0x1bcfdea3,0x179869ea,0x10e6c0e2, + 0x07a58668,0x17f5dcae,0x003b90fe }, + { 0x1496f7cb,0x1c9811f2,0x0d46f124,0x1c83b0ff,0x0b5ce55b,0x0ea44cdf, + 0x0c600fc7,0x13b3f021,0x006e8806 } }, + /* 168 */ + { { 0x143ea1db,0x11bd588d,0x1674a4b3,0x1fe352a4,0x0f1860a7,0x0110c7c2, + 0x144e146c,0x1d5bdf55,0x00a7222b }, + { 0x0b0a9144,0x1563c761,0x1e967168,0x0480a3e5,0x1ce385a0,0x1652b0a3, + 0x1a424747,0x04778558,0x00be94d5 } }, + /* 169 */ + { { 0x0b226ce7,0x17a4a2f0,0x1fa2dc1c,0x1fae8f2c,0x0c63eb8a,0x0378c2d3, + 0x1d9bb7a9,0x1fd37d18,0x007782de }, + { 0x1db38626,0x10695521,0x1d9eb45d,0x15cf0eed,0x19cdb460,0x037e2a24, + 0x192cd06e,0x0cf45125,0x00038385 } }, + /* 170 */ + { { 0x19ec1a0f,0x0c6d77eb,0x0ce725cb,0x19adfb9d,0x01a953bb,0x0ffe2c7b, + 0x1083d55d,0x1895bef6,0x00dbd986 }, + { 0x15f39eb7,0x0d5440a0,0x0365db20,0x05f9eb73,0x1717d6ee,0x03aee797, + 0x0f415195,0x188d0c17,0x008e24d3 } }, + /* 171 */ + { { 0x1a587390,0x04ec72a4,0x0fb1621d,0x16329e19,0x183c612b,0x1ed2592c, + 0x1f211b81,0x18880f75,0x00541a99 }, + { 0x024c8842,0x1920b493,0x1b017ff6,0x098255b0,0x1cf62604,0x0a5a27bf, + 0x17471674,0x093eafa6,0x00c0092c } }, + /* 172 */ + { { 0x1f2e61ef,0x1e63ae1e,0x06cd72b4,0x1083905c,0x129f47e8,0x1868c84f, + 0x113718b4,0x068e50d2,0x0075e406 }, + { 0x1bc237d0,0x1ea0fe2d,0x13c07279,0x06f7e1d8,0x1d534c95,0x0d0b1415, + 0x161a4714,0x0b18f090,0x005b7cb6 } }, + /* 173 */ + { { 0x0a28ead1,0x12538424,0x0ed1fda5,0x1b8a11fa,0x05b39802,0x1fe8bb3f, + 0x1e866b92,0x1751be12,0x007ae13e }, + { 0x0add384e,0x090b77c7,0x0cbfc1bf,0x0345b36d,0x1b5f3036,0x0c3c25e6, + 0x0ff4812e,0x0e9c551c,0x00787d80 } }, + /* 174 */ + { { 0x157fbb1c,0x0f12eb5b,0x08077af1,0x17bb6594,0x033ffe47,0x14d1b691, + 0x12112957,0x0333de50,0x005c2228 }, + { 0x08315250,0x19ea542c,0x1c25f05d,0x04345704,0x1d33f21b,0x0750ef7a, + 0x0ac2adf1,0x15775e1e,0x00e45d37 } }, + /* 175 */ + { { 0x08511c8a,0x16f8f1a1,0x129b34f4,0x0453917b,0x039a7ebb,0x18d3b13e, + 0x074d5e29,0x04509bf7,0x00ed7bc1 }, + { 0x13dea561,0x191536fc,0x03c3b473,0x07e31ba9,0x123e8544,0x10a02dd6, + 0x149f62e1,0x1928b94d,0x00aac97c } }, + /* 176 */ + { { 0x016bd00a,0x1aa753a5,0x102f307a,0x13d35beb,0x1fc06d83,0x1bf88fcd, + 0x113824ae,0x16622c7b,0x00318f97 }, + { 0x030d7138,0x06062df6,0x10c0883b,0x11be4757,0x0360644e,0x0b97d811, + 0x1d34aede,0x1433509f,0x00fa41fa } }, + /* 177 */ + { { 0x06642269,0x0016cba5,0x0de0ef51,0x10299d37,0x1e60bc81,0x1c723ca0, + 0x0788e634,0x0583a4dd,0x0038bb6b }, + { 0x0a577f87,0x1272512b,0x047f8731,0x05a4a7b8,0x007288b5,0x155fb114, + 0x0697fccd,0x00b9cec0,0x0094dd09 } }, + /* 178 */ + { { 
0x1e93f92a,0x0b67bee6,0x0d7cc545,0x06679713,0x1e750a01,0x06fce4ca, + 0x0ba40901,0x0cfa4b85,0x00920778 }, + { 0x0bf39d44,0x1238f008,0x0ed4f5f8,0x1920412d,0x03d8f5f2,0x1bd9ae4e, + 0x0d453112,0x117a537d,0x0081e842 } }, + /* 179 */ + { { 0x0477199f,0x0ece15d6,0x17b3765b,0x11dddcd6,0x0fd0e8cb,0x0d9ff720, + 0x12c62bdf,0x0c5b77f4,0x001b94ab }, + { 0x0e47f143,0x0786c59e,0x1d1858d1,0x0c47f8c7,0x1938351e,0x1387e62c, + 0x03bbc63c,0x0500aab2,0x0006a38e } }, + /* 180 */ + { { 0x13355b49,0x12d809cd,0x1afe66cb,0x04cac169,0x1f3dc20e,0x1d35e934, + 0x13e3023f,0x04107b3a,0x00a7b36c }, + { 0x1b3e8830,0x068ae1d0,0x07e702d9,0x19d5c351,0x16930d5f,0x12517168, + 0x08833fbb,0x16945045,0x00be54c6 } }, + /* 181 */ + { { 0x0d91167c,0x166d9efc,0x099897b5,0x187ef3cf,0x0c7f4517,0x12479a35, + 0x0aedc415,0x157d5c04,0x00bf30a5 }, + { 0x13828a68,0x13bc2df4,0x0fbc0da3,0x038664fe,0x146b2516,0x0ff5ac90, + 0x04eb846d,0x1bc4e65a,0x00d1c820 } }, + /* 182 */ + { { 0x1038b363,0x01f09a3c,0x01794641,0x023ea8d6,0x0cad158c,0x1d5f3013, + 0x168d3f95,0x1dad1431,0x00b7d17b }, + { 0x029c2559,0x0652c48f,0x1fff6111,0x1406ecb7,0x069484f7,0x1257ba72, + 0x11912637,0x0bcc8259,0x003997fd } }, + /* 183 */ + { { 0x0bd61507,0x103a3414,0x09934abc,0x0265aa69,0x015e329e,0x0fd84545, + 0x0fa3ffb7,0x05278d82,0x000eeb89 }, + { 0x07e259f8,0x0db4d1f5,0x0f9f99fa,0x1b6fcda2,0x1a685ce1,0x0c7b568f, + 0x1bbc9dcc,0x1f192456,0x00228916 } }, + /* 184 */ + { { 0x0a12ab5b,0x0cd712d8,0x1ef04da5,0x022e3f2a,0x02b0ccc1,0x014f68b7, + 0x05fa0161,0x03add261,0x00ec05ad }, + { 0x0c3f3708,0x0bdd2df5,0x0d675dc5,0x15f26a61,0x034e531b,0x091b88c1, + 0x0cdd1ed5,0x0acffe23,0x007d3141 } }, + /* 185 */ + { { 0x16dfefab,0x1ece02e7,0x0cddc1de,0x1e44d1b9,0x0bb95be2,0x16cb9d1c, + 0x1e8f94fa,0x1f93783a,0x00e9ce66 }, + { 0x0f6a02a1,0x0d50abb3,0x19803b5d,0x010fbec1,0x1c1b938c,0x1f9a3466, + 0x1947e251,0x002e4500,0x00d9650b } }, + /* 186 */ + { { 0x1a057e60,0x025a6252,0x1bc97914,0x19877d1b,0x1ccbdcbc,0x19040be0, + 0x1e8a98d4,0x135009d6,0x0014d669 }, + { 0x1b1f411a,0x045420ae,0x035da70b,0x175e17f0,0x177ad09f,0x17c80e17, + 0x062ad37b,0x0821a86b,0x006f4c68 } }, + /* 187 */ + { { 0x16c24a96,0x1936fa74,0x0f6668e1,0x1b790bf9,0x0e30a534,0x17794595, + 0x0aecf119,0x1fac2313,0x004c4350 }, + { 0x1855b8da,0x0b3fb8b7,0x0f0e284a,0x0847288c,0x1334341a,0x0a09f574, + 0x02d70df8,0x084b4623,0x00a726d2 } }, + /* 188 */ + { { 0x148c1086,0x17359f74,0x14e8b876,0x1ca07b97,0x022f3f1d,0x169f81e8, + 0x0e48fcd7,0x10598d9e,0x0013639e }, + { 0x0dafaa86,0x1649c7de,0x15289626,0x178bf64c,0x11329f45,0x19372282, + 0x168c658e,0x1c383466,0x00ca9365 } }, + /* 189 */ + { { 0x0c3b2d20,0x10ad63aa,0x138906cd,0x14a82f20,0x1071d742,0x10e2664e, + 0x0a96c214,0x0692e16e,0x009ce29c }, + { 0x0d3e0ad6,0x0640fb9b,0x1e10d323,0x01b53de5,0x062d9806,0x0e8d3674, + 0x1e60d7b4,0x1af56855,0x0048c4ab } }, + /* 190 */ + { { 0x00c7485a,0x110d8662,0x09d36ff4,0x08ab77ca,0x1d2e8ead,0x1b4c4931, + 0x0f2d24f1,0x065ecf66,0x0078017c }, + { 0x130cb5ee,0x0e9abb4c,0x1023b4ae,0x029d2818,0x11a4dc0d,0x1faa9397, + 0x1013e2de,0x0a9bcb83,0x0053cd04 } }, + /* 191 */ + { { 0x1d28ccac,0x06ac2fd2,0x16dd1baf,0x047cac00,0x123aa5f8,0x1850e680, + 0x0a3df1e7,0x183a7aff,0x00eea465 }, + { 0x0551803b,0x00832cf8,0x19abdc1e,0x16b33ef9,0x08e706c0,0x13b81494, + 0x064d0656,0x148f5cd2,0x001b6e42 } }, + /* 192 */ + { { 0x167d04c3,0x14049be7,0x1bae044b,0x0257c513,0x14d601e3,0x0c43c92c, + 0x14f55ad7,0x02830ff7,0x000224da }, + { 0x0c5fe36f,0x1d5dc318,0x1d47d7e1,0x1e78c09d,0x029ec580,0x18dfd9da, + 0x1cce593e,0x1e0857ff,0x0060838e } }, + /* 193 */ + { { 
0x1e0bbe99,0x19659793,0x0a8e7b90,0x1489e609,0x139037bd,0x1e3d4fd4, + 0x190d7d25,0x0045a662,0x00636eb2 }, + { 0x13ae00aa,0x07e8730c,0x0b9b4bff,0x1401fc63,0x1901c875,0x0c514fc9, + 0x0eb3d0d9,0x16c72431,0x008844ee } }, + /* 194 */ + { { 0x0b3bae58,0x0a0b8e93,0x18e7cf84,0x07bee22f,0x0eada7db,0x1e3fc0d4, + 0x027b34de,0x1b8a3f6f,0x0027ba83 }, + { 0x1bf54de5,0x1efa1cff,0x1f869c69,0x0e06176b,0x17a48727,0x071aed94, + 0x12ad0bba,0x0690fe74,0x00adb62d } }, + /* 195 */ + { { 0x0175df2a,0x188b4515,0x030cba66,0x15409ec3,0x10916082,0x19738a35, + 0x02cb2793,0x0ecebcf9,0x00b990fd }, + { 0x0df37313,0x014ecb5a,0x0d01e242,0x00aaf3a1,0x077111c2,0x17253c04, + 0x06359b26,0x1f29a21a,0x0081707e } }, + /* 196 */ + { { 0x03d6ff96,0x1ebe5590,0x010cd825,0x0a37f81b,0x0db4b5b8,0x11e26821, + 0x09709a20,0x1d5ab515,0x003792da }, + { 0x141afa0b,0x140c432c,0x160d9c54,0x13ce8285,0x0e0a7f3e,0x1293adf2, + 0x06e85f20,0x0bd29600,0x005abd63 } }, + /* 197 */ + { { 0x0ac4927c,0x13fd4270,0x1233c8dc,0x10c06b4f,0x0a0dfe38,0x0af5256e, + 0x184292f3,0x04308d56,0x005995bf }, + { 0x029dfa33,0x087c305c,0x03f062fa,0x1fc55d2b,0x10366caa,0x17a23c31, + 0x047a6cee,0x145a9068,0x0044c32c } }, + /* 198 */ + { { 0x040ed80c,0x1a54bf8f,0x14b2a0a9,0x07196263,0x16ad95f9,0x0925be16, + 0x15314fc8,0x1f701054,0x001f2162 }, + { 0x120b173e,0x1233e62b,0x17c4be5f,0x114ccc10,0x165dc40e,0x0107264e, + 0x1f2633af,0x05787d20,0x008f1d40 } }, + /* 199 */ + { { 0x1bc4058a,0x1ac97ce7,0x0bd59c13,0x1c296c52,0x18c57b15,0x1f1bde0e, + 0x0fe71573,0x08724ddb,0x00b1980f }, + { 0x12c76b09,0x0619f049,0x0c1fde26,0x0a4f3a67,0x1b4611df,0x156a431d, + 0x1915bc23,0x1366e891,0x002828ad } }, + /* 200 */ + { { 0x04cf4ac5,0x0b391626,0x1992beda,0x18347fbb,0x10832f5a,0x1d517044, + 0x0e401546,0x04eb4296,0x004973f1 }, + { 0x122eac5d,0x0cec19a9,0x166d5a39,0x0fddea17,0x083935e0,0x1907d12c, + 0x0b1eacd9,0x1a1b62d1,0x006dac8e } }, + /* 201 */ + { { 0x0da835ef,0x1daa2d77,0x043b547d,0x0227a43a,0x01b094aa,0x12f009ba, + 0x19300d69,0x0b24173b,0x004b23ef }, + { 0x1c4c7341,0x015db401,0x162f0dfa,0x0ee0da7e,0x03ee8d45,0x1c31d28f, + 0x0939cd49,0x069bbe93,0x004dd715 } }, + /* 202 */ + { { 0x15476cd9,0x1ca23394,0x069c96ef,0x1a0e5fc6,0x167e0648,0x045c7e25, + 0x16ec5107,0x0005e949,0x00fd3170 }, + { 0x0995d0e1,0x05a1ffa4,0x1dca6a87,0x0d2ba21d,0x1898276e,0x1cbb20bc, + 0x0d978357,0x1192ad3e,0x0014fac5 } }, + /* 203 */ + { { 0x1312ae18,0x0cd0032f,0x124ff26b,0x0b1b81f9,0x12846519,0x0120453e, + 0x09436685,0x0a26d57b,0x00ed7c76 }, + { 0x05d4abbc,0x113878d1,0x0844fa91,0x1bb1e7e3,0x1952f9b5,0x183aada8, + 0x1d4f1826,0x1ee9a5d3,0x00fefcb7 } }, + /* 204 */ + { { 0x1a119185,0x084a4bd5,0x1116e92f,0x1d186155,0x01179d54,0x1cef5529, + 0x002d2491,0x0fd0fc1b,0x001801a5 }, + { 0x1cafffb0,0x19e9fc6f,0x09549001,0x0678175c,0x1dfbc6cf,0x1b1dadaf, + 0x0191e075,0x03c3d5a2,0x009f8fc1 } }, + /* 205 */ + { { 0x1e69544c,0x0c1d0b8a,0x12de04c5,0x1f0acfe0,0x04c320ea,0x147e93c5, + 0x06a4788a,0x13a7a74d,0x00a9d380 }, + { 0x19a2da3b,0x1b616162,0x057211e4,0x1979ec31,0x1086938c,0x122731ea, + 0x1bdd7994,0x15dc22f1,0x003006b9 } }, + /* 206 */ + { { 0x09eead28,0x1d8f9586,0x1d37ef02,0x1ec6bb13,0x089397ee,0x0bfed967, + 0x1d841d1d,0x1ae8bf1e,0x000ab85f }, + { 0x1e5b4549,0x06d3e499,0x048bc87b,0x0576b92f,0x180404be,0x093a5a1d, + 0x0b089868,0x0ea23d28,0x00b122d6 } }, + /* 207 */ + { { 0x06a5ae7a,0x1f303df3,0x0b72f8ce,0x0e07f4ed,0x0e5c501e,0x0180a75b, + 0x0bb2be41,0x18212fb7,0x009f599d }, + { 0x0ff250ed,0x0badb8c0,0x0688371b,0x122ae869,0x027a38eb,0x02d20859, + 0x0de10958,0x1c114529,0x007d5528 } }, + /* 208 */ + { { 
0x00c26def,0x07ac7b31,0x0acb47bc,0x0b0bd4b0,0x03881025,0x0bcd80e7, + 0x1cc3ef9f,0x002607e2,0x0028ccea }, + { 0x19644ba5,0x0ed5e68b,0x1ffc2e34,0x0c87d00d,0x1e17b1fc,0x1b7e3359, + 0x0efe9829,0x09143a02,0x00c18baf } }, + /* 209 */ + { { 0x1dc4216d,0x0731c642,0x1850ab0d,0x0020ce40,0x1064a00c,0x10b8cafa, + 0x05af514e,0x13b6f52b,0x009def80 }, + { 0x07ab8d2c,0x0f432173,0x0de8ad90,0x080866c4,0x0218bb42,0x1536b262, + 0x1395f541,0x160d1011,0x000357f8 } }, + /* 210 */ + { { 0x0cd2cc88,0x14edf322,0x0e3ce763,0x03851be1,0x0a0c8cc6,0x0c3a6698, + 0x021d28c2,0x1ba36913,0x00e4a01a }, + { 0x157cd8f9,0x168f7567,0x1653120b,0x0cfa7d7a,0x0f7871b7,0x0e38bde9, + 0x10c29ca5,0x0f39c219,0x00466d7d } }, + /* 211 */ + { { 0x1dada2c7,0x1e98c494,0x06a89f51,0x014d871f,0x059e14fa,0x1e944105, + 0x146a4393,0x0448a3d5,0x00c672a5 }, + { 0x1d86b655,0x0303e642,0x0b52bc4c,0x06ba77f3,0x172a6f02,0x03402b88, + 0x144e6682,0x1f5e54ce,0x005e3d64 } }, + /* 212 */ + { { 0x1b3b4416,0x1320863c,0x0c9b666a,0x1f9f0bd5,0x16a74cd8,0x1ba56db2, + 0x0bf17aff,0x12bd71c8,0x006c8a7a }, + { 0x102a63bd,0x06305d3d,0x03c011c4,0x1e460717,0x190b06b2,0x1b9c1896, + 0x0a4631b0,0x0455b059,0x00348ae4 } }, + /* 213 */ + { { 0x1ccda2fb,0x1a3a331a,0x01c9b49f,0x1995431c,0x11f2022a,0x1bc12495, + 0x14ba16b7,0x1c1b3de5,0x00c1074d }, + { 0x0e9a65b3,0x079e7225,0x15c546ff,0x03c9580b,0x09788fd7,0x0fa86735, + 0x1ff351c4,0x1b793ca9,0x00fbadfb } }, + /* 214 */ + { { 0x00a99363,0x189f8e69,0x1c89dd45,0x0acb1ed9,0x159b2b91,0x1ae69269, + 0x1f365a05,0x16906e2d,0x00b7f976 }, + { 0x1d6dbf74,0x1ac7126a,0x10ebcd95,0x0775fae3,0x1dfe38d2,0x1bb00121, + 0x001523d1,0x05d95f99,0x00f4d41b } }, + /* 215 */ + { { 0x1dabd48d,0x0f8e7947,0x101e2914,0x037c6c65,0x146e9ce8,0x14ba08b8, + 0x1c41ab38,0x1d5c02c1,0x00180824 }, + { 0x06e58358,0x1c3b4c5b,0x1b28d600,0x0d0ea59c,0x1e6c5635,0x071a2f20, + 0x149608e0,0x073079ed,0x0067e5f6 } }, + /* 216 */ + { { 0x0f4899ef,0x04e65c6e,0x0ed1303e,0x002be13d,0x18ec9949,0x093b592c, + 0x1f1951be,0x13409823,0x009fef78 }, + { 0x13d2a071,0x09b3f67a,0x1466c25b,0x1c34ff48,0x02eefb10,0x1fd8308f, + 0x188329ac,0x10353389,0x00bc80c1 } }, + /* 217 */ + { { 0x05eb82e6,0x1929b7c7,0x1b2e4825,0x109f8fea,0x1da5e1a4,0x10b8a85a, + 0x1c431e38,0x0c53f19b,0x0049270e }, + { 0x0a6b50ad,0x11cdbddf,0x0e23ff06,0x05098344,0x1197b9a0,0x158bc083, + 0x1dfd500f,0x1f2c26e5,0x00d2ee52 } }, + /* 218 */ + { { 0x08e0362a,0x1be6942c,0x09765374,0x1f514f1f,0x0a526442,0x1b72d21a, + 0x1ccebfe0,0x17dcb576,0x00dfb478 }, + { 0x073eede6,0x08f8e73b,0x16cbc12a,0x1215a856,0x0da2fa53,0x1bdfaa98, + 0x1ce9799b,0x16811be8,0x00d9a140 } }, + /* 219 */ + { { 0x0e8ea498,0x10110dab,0x18fb8243,0x08f0526a,0x12ade623,0x01c899ae, + 0x0c6b81ae,0x11ac47e9,0x00760c05 }, + { 0x0198aa79,0x1c4dac66,0x1eae9fc2,0x1121a5e0,0x0556af74,0x00887ef1, + 0x10253881,0x05b1e320,0x00714198 } }, + /* 220 */ + { { 0x0d4b0f45,0x1850719a,0x0aa5385b,0x10167072,0x01d5ed92,0x126359e3, + 0x191cebcc,0x19d13aa9,0x003af9d1 }, + { 0x00930371,0x0c7bcc09,0x105c25ff,0x04cc9843,0x0309beda,0x02ee6e21, + 0x17583a55,0x186e72af,0x00b1f815 } }, + /* 221 */ + { { 0x09fec44a,0x07d53c74,0x0a932be1,0x055c8e79,0x0a624c8c,0x003ee0db, + 0x0149a472,0x0282a87e,0x00a41aed }, + { 0x1d5ffe04,0x121a9ccb,0x16db8810,0x1965bec4,0x177758ba,0x105f43c0, + 0x03be1759,0x1bb0df6c,0x00d6e9c1 } }, + /* 222 */ + { { 0x06853264,0x15174bf6,0x0c1282ce,0x0a676fc4,0x0e9be771,0x15dbdc75, + 0x03086e44,0x0215d37f,0x009c9c6e }, + { 0x0030b74c,0x1184d2cf,0x18c7a428,0x0e929ad4,0x179f24ed,0x0591d24d, + 0x06da27d1,0x12c81f4c,0x00566bd5 } }, + /* 223 */ + { { 
0x018061f3,0x136008c6,0x00ff1c01,0x164ba6f9,0x13245190,0x04701393, + 0x117bc17f,0x121ea4a6,0x00cf2c73 }, + { 0x10eb30cf,0x04de75a0,0x1ddc0ea8,0x05d7741a,0x1f255cfd,0x021d0a87, + 0x05e7a10b,0x0ab15441,0x0002f517 } }, + /* 224 */ + { { 0x0ddb7d07,0x0b77bca5,0x1155400e,0x1f8e8448,0x0a3ce0b4,0x075663c5, + 0x05f7ebfe,0x14bd1a9b,0x0014e9ad }, + { 0x0f7079e2,0x15240509,0x0c2003b6,0x15479bc9,0x0157d45b,0x0f16bc1c, + 0x0ba005d9,0x1571d3b3,0x00a0ad4f } }, + /* 225 */ + { { 0x0a653618,0x1fdbb10a,0x1aaa97c2,0x05027863,0x09d5e187,0x139ba24a, + 0x1478554f,0x170dcadd,0x00bcd530 }, + { 0x12e9c47b,0x14df4299,0x00166ac5,0x0eedfd6a,0x1fbb4dc2,0x0bb08c95, + 0x107736ea,0x19ed2f26,0x00909283 } }, + /* 226 */ + { { 0x16e81a13,0x1d801923,0x05c48e59,0x1c3532c4,0x019d69be,0x1b0de997, + 0x126823b4,0x19359c2a,0x0035eeb7 }, + { 0x1e4e5bdc,0x140572d3,0x13bb1b84,0x1a59a76d,0x06bc12dc,0x11263713, + 0x01914b90,0x1e88915d,0x009a8b2c } }, + /* 227 */ + { { 0x09d03b59,0x1238df90,0x16bcaafd,0x1cc5476c,0x1eec9c90,0x18b475ea, + 0x0de7fdff,0x1e9a8922,0x006bdb60 }, + { 0x0a55bc30,0x16d7f5e4,0x025ff836,0x1d5a2c20,0x03bddc79,0x0ba0a60f, + 0x02a50b86,0x1fb29741,0x0001ec3c } }, + /* 228 */ + { { 0x1c9485c2,0x1313bf5e,0x1ec431ee,0x1934f245,0x08d8a48c,0x0b07b851, + 0x13d93d87,0x1808ea8c,0x00d1acb1 }, + { 0x06f36612,0x13481589,0x186362f4,0x07489dc0,0x157ee59c,0x14099841, + 0x1b0937e2,0x13a80ac4,0x007dcd07 } }, + /* 229 */ + { { 0x105a4b48,0x073ea69f,0x08c1dc97,0x1a52a46e,0x0915aadc,0x1cb8c095, + 0x06e3463d,0x1126efa3,0x000bf535 }, + { 0x0c68ea73,0x0f66cad3,0x0e96134d,0x07779504,0x1a723c7f,0x1a637a39, + 0x1bf27ed9,0x1b3c2cd0,0x00d28be4 } }, + /* 230 */ + { { 0x18fa8e4b,0x095cc831,0x0ff63f17,0x1e30dd12,0x1b6fc559,0x115521b7, + 0x0338e9b7,0x154a21f1,0x00d76007 }, + { 0x123a4988,0x088555b2,0x17409ccb,0x0b9e88e9,0x07278b45,0x184151a0, + 0x0c05fd19,0x0d166077,0x00f2b52f } }, + /* 231 */ + { { 0x1835b4ca,0x0abf57d4,0x19a72f03,0x0465f976,0x031982d2,0x1b406332, + 0x14ea3bba,0x11d98b5d,0x00d8dbe9 }, + { 0x05a02709,0x1d4df1fe,0x0e87ea32,0x1cd1cbeb,0x0a85230b,0x01e6f887, + 0x1c17faf5,0x147dcab2,0x00e01593 } }, + /* 232 */ + { { 0x0a75a0a6,0x1f2d7a87,0x01600cf4,0x044d58af,0x16406512,0x0a87e80b, + 0x1c19bf9b,0x1635d71d,0x00afec07 }, + { 0x00bb0a31,0x1dccab3c,0x0c26ab9f,0x15e7986e,0x1f3896f1,0x10ad00d5, + 0x1f76454e,0x0a8dc5b7,0x00a71b93 } }, + /* 233 */ + { { 0x18f593d2,0x1c709700,0x1e048aef,0x12085140,0x0f2add1a,0x02ed85d2, + 0x0f645414,0x0b8c50a4,0x0053a200 }, + { 0x07f2b935,0x1e45b1cf,0x00a58681,0x1f2eb583,0x0ca2c2bf,0x1753ba8c, + 0x18f61af3,0x1367ab11,0x00bf47d1 } }, + /* 234 */ + { { 0x1d7665d5,0x194b3d3e,0x0bd37959,0x0060ae5e,0x0903f4e3,0x02d7406a, + 0x06d85100,0x0fe73934,0x00001c2c }, + { 0x09efc6d6,0x01d400a3,0x11e9c905,0x017b54f7,0x150a4c81,0x1385d3c0, + 0x066d7d95,0x1cf0dff7,0x00fdadf8 } }, + /* 235 */ + { { 0x1fc00785,0x09c65c47,0x123ad9ff,0x14eb2276,0x08fbc77f,0x082adf9b, + 0x12501153,0x09ab5487,0x003a838e }, + { 0x1e97bb9a,0x10b31949,0x07653655,0x1266c688,0x12a839eb,0x08d3056d, + 0x168d4556,0x0af0e7c3,0x003cdb82 } }, + /* 236 */ + { { 0x1de77eab,0x1b8a054b,0x19204244,0x038a1a82,0x1d0dff7e,0x05696758, + 0x1ee9d8b7,0x113e3eaf,0x005a60cc }, + { 0x00d45673,0x059b1c12,0x04f19560,0x057c32b2,0x0b7411b8,0x025c6eb2, + 0x1f0015ca,0x0dfb7fb1,0x00922ff5 } }, + /* 237 */ + { { 0x09a129a1,0x1932ef76,0x0a138106,0x039caf98,0x1be3ca5b,0x0623675f, + 0x158810e0,0x0fbed8b9,0x0072919a }, + { 0x0fb90f9a,0x0c7a29d4,0x1900c6ca,0x13801711,0x11856d71,0x073bbcb7, + 0x026b8cb0,0x1006c481,0x005e7917 } }, + /* 238 */ + { { 
0x1f63cdfb,0x00b762ab,0x12b93f57,0x146ae3e3,0x197ca8e6,0x15f52b02, + 0x1eaff389,0x0e3c4985,0x004e0a53 }, + { 0x05765357,0x1b52069d,0x1ce8ad09,0x135e881a,0x11a323c8,0x185720e8, + 0x13bae3cd,0x031aacc0,0x00f5ff78 } }, + /* 239 */ + { { 0x1a09df21,0x1f9f1ff0,0x1ba391fe,0x0ba51dcc,0x0901526d,0x1e8514e4, + 0x1990825a,0x1d2a67eb,0x00e41df0 }, + { 0x13ba9e3f,0x02fed205,0x0136254c,0x0819d64c,0x167c7f23,0x10c93f81, + 0x157c219b,0x0dd589e2,0x008edd7d } }, + /* 240 */ + { { 0x0bfc8ff3,0x0d0ee070,0x0dbd0bf2,0x1fb057d2,0x181ef14e,0x17be6651, + 0x1a599c05,0x195db15d,0x001432c1 }, + { 0x10b23c26,0x0342414b,0x0d6c9cfb,0x1fd0e60e,0x10f5aa64,0x1b72f577, + 0x0b1b8e27,0x016b591a,0x00caef48 } }, + /* 241 */ + { { 0x15315922,0x122e4bc3,0x18f32954,0x12a2e260,0x0f2cbd82,0x10685b27, + 0x08dbcf39,0x0fd1df5c,0x00d0ba17 }, + { 0x11b3af60,0x1d4d747d,0x0b688394,0x12d5ca7a,0x0ef281a7,0x1b02efcf, + 0x18580758,0x0f838a95,0x00f31c95 } }, + /* 242 */ + { { 0x09cc4597,0x07ac6a92,0x18280a30,0x002b6175,0x0814adc5,0x1e2ab9a5, + 0x10ebbf17,0x1972dc2f,0x00013404 }, + { 0x09a824bf,0x14f12c2e,0x07abb5ec,0x0630bc00,0x168acd59,0x134130f7, + 0x19b235bb,0x09723267,0x006f377c } }, + /* 243 */ + { { 0x08333fd2,0x1c9dd68d,0x0aa56e27,0x060404b4,0x15acea89,0x081bf57b, + 0x14188479,0x09da5a12,0x006dba3e }, + { 0x104399cd,0x0477cc66,0x0dceb7a9,0x038cddcd,0x0caf3181,0x03a960bf, + 0x129dcbd8,0x08477d9e,0x00f13cf3 } }, + /* 244 */ + { { 0x0919e2eb,0x175cf605,0x0b03da33,0x13432bec,0x0229983a,0x1ddb3d5d, + 0x0b4f3ee8,0x1524e977,0x00c83fa9 }, + { 0x02fa1ce0,0x0be8d85b,0x063befc3,0x16c1ea68,0x06f04e58,0x17cf2938, + 0x1a0efea3,0x1e8bae04,0x00b49d70 } }, + /* 245 */ + { { 0x1ad5513b,0x0a63a887,0x1d478b64,0x065dd962,0x19d5905f,0x020c6cfd, + 0x073db614,0x1761861e,0x0059cfad }, + { 0x15cb7fd6,0x0b3d611a,0x0109a8f8,0x06cf7104,0x18864249,0x02c64853, + 0x0d9fabbb,0x0c46a949,0x005babf3 } }, + /* 246 */ + { { 0x0e424865,0x1e4c0e8f,0x1955dfcd,0x0050f1e5,0x0c0588b0,0x1878dcf0, + 0x03c1c0a5,0x14f204d9,0x006188c6 }, + { 0x10f244da,0x17cd0cde,0x02021cc1,0x19dab9f6,0x136371ec,0x07cdcf90, + 0x0764d51c,0x0ebbea17,0x00993fe4 } }, + /* 247 */ + { { 0x1b2c3609,0x0718e6fc,0x11b53a9a,0x16338058,0x1510184e,0x160d4d3b, + 0x05adeb27,0x0cc9900c,0x0081f764 }, + { 0x15fbe978,0x0be152d3,0x00ecd587,0x07fda7e3,0x1d2bf674,0x0f82280e, + 0x18360e34,0x054bfd20,0x00564a81 } }, + /* 248 */ + { { 0x1a817d1d,0x12d327a7,0x0a0b83de,0x12d0897d,0x1f9aa55f,0x0d07e6ab, + 0x15b2d7fd,0x19e01ca3,0x00226bf3 }, + { 0x0f2833cf,0x168d4fc9,0x13e26a35,0x0146b49e,0x17f7720a,0x1624c79f, + 0x00d8454d,0x08ffe4af,0x0068779f } }, + /* 249 */ + { { 0x13043d08,0x0d860e0b,0x10083e9e,0x08cee83f,0x126d0a54,0x1f144d36, + 0x182f4dd9,0x1a3d6125,0x0097bcb0 }, + { 0x132ed3c3,0x15b75547,0x006f120a,0x09e2a365,0x178f3c8a,0x1a79dfd0, + 0x1955346f,0x1d014f08,0x00a872ff } }, + /* 250 */ + { { 0x032b2086,0x0d5bc9ad,0x183d21ac,0x16e21d02,0x0e6bee1e,0x06c89db5, + 0x0daa6f43,0x1f96e654,0x0002812b }, + { 0x0f605318,0x11febe56,0x1f5b4769,0x1cbaa1fb,0x0d619646,0x01cc1081, + 0x1abe875a,0x193fca72,0x0007391c } }, + /* 251 */ + { { 0x0b80d02b,0x080abf84,0x01dfdff1,0x0667a2c5,0x142ae6b8,0x0d7c3c6a, + 0x0821eb28,0x1b8fcda5,0x00355d2a }, + { 0x087386e1,0x00f99ad1,0x190c9d6d,0x0e5529f1,0x189eafd2,0x1166f3cc, + 0x09e4a1b2,0x1c6f8547,0x003dc2b1 } }, + /* 252 */ + { { 0x04581352,0x144e90e0,0x19e0afb5,0x01904a6e,0x1701f0a0,0x0ac84ff6, + 0x11ac80ef,0x020799b0,0x00c47869 }, + { 0x04c768ed,0x0dd3b841,0x107d95d7,0x1dd404d0,0x0ce0e72f,0x1f6ab566, + 0x14c9ccc4,0x0d1ab769,0x00ccc429 } }, + /* 253 */ + { { 
0x1d7620b9,0x07286f09,0x04a95aa5,0x14b914b3,0x087c9d89,0x1b2033aa,
+        0x073f7001,0x0855490e,0x00e147eb },
+      { 0x0cf3ae46,0x1a55a775,0x0d43ef89,0x126df6a0,0x040eafd4,0x1f23a464,
+        0x1b8f7cab,0x08e101d2,0x00239ac0 } },
+    /* 254 */
+    { { 0x0bfee8d4,0x00e8f9a9,0x1ec3fb12,0x016b9ff4,0x1af3cce8,0x064f1674,
+        0x16744171,0x147ebefc,0x00c55fa1 },
+      { 0x0257c227,0x0c378a74,0x0af802cc,0x02ca7e68,0x04fb2c5b,0x04cc5548,
+        0x1a6426bf,0x139a9e96,0x00094cd9 } },
+    /* 255 */
+    { { 0x1703beba,0x14c0e426,0x13aca462,0x03a2a065,0x149ec863,0x1964f1de,
+        0x14ce9117,0x16c85575,0x00b90a30 },
+      { 0x14a5abf9,0x032a027d,0x16dd80ed,0x0ea186eb,0x1d89f004,0x0166651a,
+        0x13ddbe69,0x13436f24,0x00019f8b } },
+};
+
+/* Multiply the base point of P256 by the scalar and return the result.
+ * If map is true then convert result to affine coordinates.
+ *
+ * Stripe implementation.
+ * Pre-generated: 2^0, 2^32, ...
+ * Pre-generated: products of all combinations of above.
+ * 8 doubles and adds (with qz=1)
+ *
+ * r     Resulting point.
+ * k     Scalar to multiply by.
+ * map   Indicates whether to convert result to affine.
+ * ct    Constant time required.
+ * heap  Heap to use for allocation.
+ * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
+ */
+static int sp_256_ecc_mulmod_base_9(sp_point_256* r, const sp_digit* k,
+        int map, int ct, void* heap)
+{
+    return sp_256_ecc_mulmod_stripe_9(r, &p256_base, p256_table,
+                                      k, map, ct, heap);
+}
+
+#endif
+
+/* Multiply the base point of P256 by the scalar and return the result.
+ * If map is true then convert result to affine coordinates.
+ *
+ * km    Scalar to multiply by.
+ * r     Resulting point.
+ * map   Indicates whether to convert result to affine.
+ * heap  Heap to use for allocation.
+ * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
+ */
+int sp_ecc_mulmod_base_256(const mp_int* km, ecc_point* r, int map, void* heap)
+{
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    sp_point_256* point = NULL;
+    sp_digit* k = NULL;
+#else
+    sp_point_256 point[1];
+    sp_digit k[9];
+#endif
+    int err = MP_OKAY;
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
+                                   DYNAMIC_TYPE_ECC);
+    if (point == NULL)
+        err = MEMORY_E;
+    if (err == MP_OKAY) {
+        k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
+                               DYNAMIC_TYPE_ECC);
+        if (k == NULL)
+            err = MEMORY_E;
+    }
+#endif
+
+    if (err == MP_OKAY) {
+        sp_256_from_mp(k, 9, km);
+
+        err = sp_256_ecc_mulmod_base_9(point, k, map, 1, heap);
+    }
+    if (err == MP_OKAY) {
+        err = sp_256_point_to_ecc_point_9(point, r);
+    }
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    if (k != NULL)
+        XFREE(k, heap, DYNAMIC_TYPE_ECC);
+    if (point != NULL)
+        XFREE(point, heap, DYNAMIC_TYPE_ECC);
+#endif
+
+    return err;
+}
+
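For reference, the two fixed-base routines above select entries from p256_table by taking one bit from each of the eight 32-bit stripes of the scalar, as described in the table comment. A minimal, self-contained sketch of that index calculation follows; the helper name stripe_index and the 32-byte big-endian scalar input are assumptions made for illustration, not code from this patch, and the real sp_256_ecc_mulmod_stripe_9() works on 29-bit limbs rather than bytes.

    #include <stdint.h>

    /* Illustrative only: gather the index used to look up p256_table.
     * scalar_be is the 256-bit scalar as 32 big-endian bytes and j is the
     * bit position within a stripe (0..31).  Bit i of the returned index is
     * scalar bit (32 * i + j), matching "8 points ... distance of 32" above. */
    static unsigned int stripe_index(const uint8_t scalar_be[32], int j)
    {
        unsigned int y = 0;
        int i;

        for (i = 0; i < 8; i++) {
            int bit = 32 * i + j;            /* absolute bit position      */
            int byt = 31 - (bit / 8);        /* big-endian byte holding it */
            y |= (unsigned int)((scalar_be[byt] >> (bit % 8)) & 1U) << i;
        }
        return y;                            /* index into p256_table[256] */
    }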
+/* Multiply the base point of P256 by the scalar, add point a and return
+ * the result. If map is true then convert result to affine coordinates.
+ *
+ * km      Scalar to multiply by.
+ * am      Point to add to scalar multiply result.
+ * inMont  Point to add is in Montgomery form.
+ * r       Resulting point.
+ * map     Indicates whether to convert result to affine.
+ * heap    Heap to use for allocation.
+ * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
+ */
+int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am,
+        int inMont, ecc_point* r, int map, void* heap)
+{
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    sp_point_256* point = NULL;
+    sp_digit* k = NULL;
+#else
+    sp_point_256 point[2];
+    sp_digit k[9 + 9 * 2 * 5];
+#endif
+    sp_point_256* addP = NULL;
+    sp_digit* tmp = NULL;
+    int err = MP_OKAY;
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
+                                   DYNAMIC_TYPE_ECC);
+    if (point == NULL)
+        err = MEMORY_E;
+    if (err == MP_OKAY) {
+        k = (sp_digit*)XMALLOC(
+            sizeof(sp_digit) * (9 + 9 * 2 * 5),
+            heap, DYNAMIC_TYPE_ECC);
+        if (k == NULL)
+            err = MEMORY_E;
+    }
+#endif
+
+    if (err == MP_OKAY) {
+        addP = point + 1;
+        tmp = k + 9;
+
+        sp_256_from_mp(k, 9, km);
+        sp_256_point_from_ecc_point_9(addP, am);
+    }
+    if ((err == MP_OKAY) && (!inMont)) {
+        err = sp_256_mod_mul_norm_9(addP->x, addP->x, p256_mod);
+    }
+    if ((err == MP_OKAY) && (!inMont)) {
+        err = sp_256_mod_mul_norm_9(addP->y, addP->y, p256_mod);
+    }
+    if ((err == MP_OKAY) && (!inMont)) {
+        err = sp_256_mod_mul_norm_9(addP->z, addP->z, p256_mod);
+    }
+    if (err == MP_OKAY) {
+        err = sp_256_ecc_mulmod_base_9(point, k, 0, 0, heap);
+    }
+    if (err == MP_OKAY) {
+        sp_256_proj_point_add_9(point, point, addP, tmp);
+
+        if (map) {
+            sp_256_map_9(point, point, tmp);
+        }
+
+        err = sp_256_point_to_ecc_point_9(point, r);
+    }
+
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
+    if (k != NULL)
+        XFREE(k, heap, DYNAMIC_TYPE_ECC);
+    if (point)
+        XFREE(point, heap, DYNAMIC_TYPE_ECC);
+#endif
+
+    return err;
+}
+
+#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
+    defined(HAVE_ECC_VERIFY)
+/* Returns 1 if the number is zero.
+ * Implementation is constant time.
+ *
+ * a  Number to check.
+ * returns 1 if the number is zero and 0 otherwise.
+ */
+static int sp_256_iszero_9(const sp_digit* a)
+{
+    return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7] |
+            a[8]) == 0;
+}
+
+#endif /* WOLFSSL_VALIDATE_ECC_KEYGEN | HAVE_ECC_SIGN | HAVE_ECC_VERIFY */
+/* Add 1 to a. (a = a + 1)
+ *
+ * a  A single precision integer.
+ */
+SP_NOINLINE static void sp_256_add_one_9(sp_digit* a)
+{
+    a[0]++;
+    sp_256_norm_9(a);
+}
+
+/* Read big endian unsigned byte array into r.
+ *
+ * r     A single precision integer.
+ * size  Maximum number of bytes to convert.
+ * a     Byte array.
+ * n     Number of bytes in array to read.
+ */
+static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n)
+{
+    int i;
+    int j = 0;
+    word32 s = 0;
+
+    r[0] = 0;
+    for (i = n-1; i >= 0; i--) {
+        r[j] |= (((sp_digit)a[i]) << s);
+        if (s >= 21U) {
+            r[j] &= 0x1fffffff;
+            s = 29U - s;
+            if (j + 1 >= size) {
+                break;
+            }
+            r[++j] = (sp_digit)a[i] >> s;
+            s = 8U - s;
+        }
+        else {
+            s += 8U;
+        }
+    }
+
+    for (j++; j < size; j++) {
+        r[j] = 0;
+    }
+}
+
+/* Generates a scalar that is in the range 1..order-1.
+ *
+ * rng  Random number generator.
+ * k    Scalar value.
+ * returns RNG failures, MEMORY_E when memory allocation fails and
+ * MP_OKAY on success.
+ */
+static int sp_256_ecc_gen_k_9(WC_RNG* rng, sp_digit* k)
+{
+    int err;
+    byte buf[32];
+
+    do {
+        err = wc_RNG_GenerateBlock(rng, buf, sizeof(buf));
+        if (err == 0) {
+            sp_256_from_bin(k, 9, buf, (int)sizeof(buf));
+            if (sp_256_cmp_9(k, p256_order2) < 0) {
+                sp_256_add_one_9(k);
+                break;
+            }
+        }
+    }
+    while (err == 0);
+
+    return err;
+}
+
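sp_256_from_bin() above is where the new limb layout for the 32-bit C code shows up: a 256-bit value is carried in nine 29-bit limbs (9 x 29 = 261 bits), each masked to 0x1fffffff, instead of the previous ten words. The stand-alone sketch below packs a 32-byte big-endian value the same way; the function name and the fixed-size input are assumptions made for illustration.

    #include <stdint.h>

    /* Illustrative equivalent of the packing done by sp_256_from_bin():
     * 32 big-endian bytes become nine 29-bit limbs, least significant limb
     * first, each masked to 0x1fffffff. */
    static void pack_29bit_limbs(uint32_t r[9], const uint8_t a_be[32])
    {
        uint64_t acc  = 0;      /* bit accumulator                     */
        int      bits = 0;      /* number of valid bits held in acc    */
        int      i    = 31;     /* next input byte, least significant  */
        int      j;

        for (j = 0; j < 9; j++) {
            while ((bits < 29) && (i >= 0)) {
                acc  |= (uint64_t)a_be[i--] << bits;   /* append next byte */
                bits += 8;
            }
            r[j]  = (uint32_t)(acc & 0x1fffffff);      /* take low 29 bits */
            acc >>= 29;
            bits -= 29;
            if (bits < 0)
                bits = 0;
        }
    }

sp_256_to_bin_9() further down does the reverse, propagating carries out of each 29-bit limb before serializing back to 32 big-endian bytes.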
+ * + * rng Random number generator. + * priv Generated private value. + * pub Generated public point. + * heap Heap to use for allocation. + * returns ECC_INF_E when the point does not have the correct order, RNG + * failures, MEMORY_E when memory allocation fails and MP_OKAY on success. + */ +int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* point = NULL; + sp_digit* k = NULL; +#else + #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN + sp_point_256 point[2]; + #else + sp_point_256 point[1]; + #endif + sp_digit k[9]; +#endif +#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN + sp_point_256* infinity = NULL; +#endif + int err = MP_OKAY; + + + (void)heap; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); + #else + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + #endif + if (point == NULL) + err = MEMORY_E; + if (err == MP_OKAY) { + k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap, + DYNAMIC_TYPE_ECC); + if (k == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN + infinity = point + 1; + #endif + + err = sp_256_ecc_gen_k_9(rng, k); + } + if (err == MP_OKAY) { + err = sp_256_ecc_mulmod_base_9(point, k, 1, 1, NULL); + } + +#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN + if (err == MP_OKAY) { + err = sp_256_ecc_mulmod_9(infinity, point, p256_order, 1, 1, NULL); + } + if (err == MP_OKAY) { + if (sp_256_iszero_9(point->x) || sp_256_iszero_9(point->y)) { + err = ECC_INF_E; + } + } +#endif + + if (err == MP_OKAY) { + err = sp_256_to_mp(k, priv); + } + if (err == MP_OKAY) { + err = sp_256_point_to_ecc_point_9(point, pub); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (k != NULL) + XFREE(k, heap, DYNAMIC_TYPE_ECC); + if (point != NULL) { + /* point is not sensitive, so no need to zeroize */ + XFREE(point, heap, DYNAMIC_TYPE_ECC); + } +#endif + + return err; +} + +#ifdef HAVE_ECC_DHE +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 32 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_256_to_bin_9(sp_digit* r, byte* a) +{ + int i; + int j; + int s = 0; + int b; + + for (i=0; i<8; i++) { + r[i+1] += r[i] >> 29; + r[i] &= 0x1fffffff; + } + j = 256 / 8 - 1; + a[j] = 0; + for (i=0; i<9 && j>=0; i++) { + b = 0; + /* lint allow cast of mismatch sp_digit and int */ + a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ + b += 8 - s; + if (j < 0) { + break; + } + while (b < 29) { + a[j--] = (byte)(r[i] >> b); + b += 8; + if (j < 0) { + break; + } + } + s = 8 - (b - 29); + if (j >= 0) { + a[j] = 0; + } + if (s != 0) { + j++; + } + } +} + +/* Multiply the point by the scalar and serialize the X ordinate. + * The number is 0 padded to maximum size on output. + * + * priv Scalar to multiply the point by. + * pub Point to multiply. + * out Buffer to hold X ordinate. + * outLen On entry, size of the buffer in bytes. + * On exit, length of data in buffer in bytes. + * heap Heap to use for allocation. + * returns BUFFER_E if the buffer is to small for output size, + * MEMORY_E when memory allocation fails and MP_OKAY on success. 
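sp_256_to_bin_9() above is the inverse conversion: it first folds the carries so every limb is a clean 29-bit value, then walks the limbs emitting 32 big-endian bytes. The same output can be described byte-by-byte from absolute bit offsets, which may be easier to follow; a sketch assuming already-normalised limbs (the helper name is illustrative):

    #include <stdint.h>

    #define LIMB_BITS 29

    /* Serialise nine normalised 29-bit limbs (little-endian limb order)
     * as 32 big-endian bytes. */
    static void limbs_to_be_bytes(const uint32_t r[9], unsigned char out[32])
    {
        int i;

        for (i = 0; i < 32; i++) {
            int bit  = 8 * (31 - i);         /* offset of this byte's LSB  */
            int word = bit / LIMB_BITS;
            int off  = bit % LIMB_BITS;
            uint32_t v = r[word] >> off;

            if ((off > LIMB_BITS - 8) && (word + 1 < 9)) {
                v |= r[word + 1] << (LIMB_BITS - off);  /* next limb's bits */
            }
            out[i] = (unsigned char)v;
        }
    }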
+ */ +int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, + word32* outLen, void* heap) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_point_256* point = NULL; + sp_digit* k = NULL; +#else + sp_point_256 point[1]; + sp_digit k[9]; +#endif + int err = MP_OKAY; + + if (*outLen < 32U) { + err = BUFFER_E; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, + DYNAMIC_TYPE_ECC); + if (point == NULL) + err = MEMORY_E; + } + if (err == MP_OKAY) { + k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap, + DYNAMIC_TYPE_ECC); + if (k == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + sp_256_from_mp(k, 9, priv); + sp_256_point_from_ecc_point_9(point, pub); + err = sp_256_ecc_mulmod_9(point, point, k, 1, 1, heap); + } + if (err == MP_OKAY) { + sp_256_to_bin_9(point->x, out); + *outLen = 32; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (k != NULL) + XFREE(k, heap, DYNAMIC_TYPE_ECC); + if (point != NULL) + XFREE(point, heap, DYNAMIC_TYPE_ECC); +#endif + + return err; +} +#endif /* HAVE_ECC_DHE */ + +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_256_rshift_9(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<8; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; + } +#else + for (i=0; i<0; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (29 - n)) & 0x1fffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (29 - n)) & 0x1fffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (29 - n)) & 0x1fffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (29 - n)) & 0x1fffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (29 - n)) & 0x1fffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (29 - n)) & 0x1fffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (29 - n)) & 0x1fffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (29 - n)) & 0x1fffffff); + } + r[0] = (a[0] >> n) | ((a[1] << (29 - n)) & 0x1fffffff); + r[1] = (a[1] >> n) | ((a[2] << (29 - n)) & 0x1fffffff); + r[2] = (a[2] >> n) | ((a[3] << (29 - n)) & 0x1fffffff); + r[3] = (a[3] >> n) | ((a[4] << (29 - n)) & 0x1fffffff); + r[4] = (a[4] >> n) | ((a[5] << (29 - n)) & 0x1fffffff); + r[5] = (a[5] >> n) | ((a[6] << (29 - n)) & 0x1fffffff); + r[6] = (a[6] >> n) | ((a[7] << (29 - n)) & 0x1fffffff); + r[7] = (a[7] >> n) | ((a[8] << (29 - n)) & 0x1fffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[8] = a[8] >> n; +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. 
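The function below is the textbook multiply-by-one-limb loop; the non-small branch computes the same value with the loop unrolled and the carries left unpropagated (the redundant 29-bits-in-32 representation absorbs them). In generic form (illustrative helper), the 64-bit accumulator comfortably holds a 29x29-bit product plus the running carry, so every limb costs one multiply, one mask and one shift:

    #include <stdint.h>

    #define LIMB_BITS 29
    #define LIMB_MASK (((uint32_t)1 << LIMB_BITS) - 1)

    /* r = a * b, a has n 29-bit limbs, b is a single limb; r holds n+1 limbs. */
    static void mul_limb(uint32_t* r, const uint32_t* a, int n, uint32_t b)
    {
        uint64_t t = 0;
        int i;

        for (i = 0; i < n; i++) {
            t += (uint64_t)a[i] * b;         /* product plus incoming carry */
            r[i] = (uint32_t)(t & LIMB_MASK);
            t >>= LIMB_BITS;                 /* carry into the next limb    */
        }
        r[n] = (uint32_t)t;
    }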
+ */ +SP_NOINLINE static void sp_256_mul_d_9(sp_digit* r, const sp_digit* a, + sp_digit b) +{ +#ifdef WOLFSSL_SP_SMALL + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 9; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1fffffff); + t >>= 29; + } + r[9] = (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t[9]; + + t[ 0] = tb * a[ 0]; + t[ 1] = tb * a[ 1]; + t[ 2] = tb * a[ 2]; + t[ 3] = tb * a[ 3]; + t[ 4] = tb * a[ 4]; + t[ 5] = tb * a[ 5]; + t[ 6] = tb * a[ 6]; + t[ 7] = tb * a[ 7]; + t[ 8] = tb * a[ 8]; + r[ 0] = (sp_digit) (t[ 0] & 0x1fffffff); + r[ 1] = (sp_digit)((t[ 0] >> 29) + (t[ 1] & 0x1fffffff)); + r[ 2] = (sp_digit)((t[ 1] >> 29) + (t[ 2] & 0x1fffffff)); + r[ 3] = (sp_digit)((t[ 2] >> 29) + (t[ 3] & 0x1fffffff)); + r[ 4] = (sp_digit)((t[ 3] >> 29) + (t[ 4] & 0x1fffffff)); + r[ 5] = (sp_digit)((t[ 4] >> 29) + (t[ 5] & 0x1fffffff)); + r[ 6] = (sp_digit)((t[ 5] >> 29) + (t[ 6] & 0x1fffffff)); + r[ 7] = (sp_digit)((t[ 6] >> 29) + (t[ 7] & 0x1fffffff)); + r[ 8] = (sp_digit)((t[ 7] >> 29) + (t[ 8] & 0x1fffffff)); + r[ 9] = (sp_digit) (t[ 8] >> 29); +#endif /* WOLFSSL_SP_SMALL */ +} + +SP_NOINLINE static void sp_256_lshift_18(sp_digit* r, const sp_digit* a, + byte n) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + + r[18] = a[17] >> (29 - n); + for (i=17; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (29 - n))) & 0x1fffffff; + } +#else + sp_int_digit s; + sp_int_digit t; + + s = (sp_int_digit)a[17]; + r[18] = s >> (29U - n); + s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); + r[17] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); + r[16] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); + r[15] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); + r[14] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); + r[13] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); + r[12] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); + r[11] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); + r[10] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (29U - n))) & 0x1fffffff; +#endif /* WOLFSSL_SP_SMALL */ + r[0] = (a[0] << n) & 0x1fffffff; +} + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. 
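As the comment just below notes, this division relies on the shifted divisor's top limb being 0x1fffffff. That is arranged by the multiply by (1 << 5) and the left shift by 5 in the body: 9 limbs x 29 bits - 256 bits = 5, and the top 29 bits of both the P-256 prime and its order are all ones, so after the shift the divisor exactly fills its top limb. The payoff is a nearly free quotient-digit estimate; a single-limb illustration (hypothetical helper) of why the remainder's top word is already within two of the true digit:

    #include <stdint.h>

    #define LIMB_BITS 29
    #define LIMB_MASK (((uint32_t)1 << LIMB_BITS) - 1)

    /* Quotient digit of (hi*2^29 + lo) / (2^29 - 1) with hi, lo < 2^29.
     * hi*2^29 + lo = hi*(2^29 - 1) + (hi + lo), so qhat = hi is short by
     * at most two and a couple of conditional subtractions finish it. */
    static uint32_t div_digit_all_ones(uint32_t hi, uint32_t lo)
    {
        uint32_t qhat = hi;
        uint64_t rem  = (uint64_t)hi + lo;   /* remainder when qhat == hi */

        while (rem >= LIMB_MASK) {           /* runs at most twice        */
            qhat++;
            rem -= LIMB_MASK;
        }
        return qhat;
    }

In the routine itself the estimate is taken against the full multi-limb divisor and corrected with sp_256_cond_sub_9().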
+ * + * Simplified based on top word of divisor being (1 << 29) - 1 + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_256_div_9(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; + sp_digit r1; + sp_digit mask; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 9 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 9 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 18 + 1; + sd = t2 + 9 + 1; + + sp_256_mul_d_9(sd, d, (sp_digit)1 << 5); + sp_256_lshift_18(t1, a, 5); + t1[9 + 9] += t1[9 + 9 - 1] >> 29; + t1[9 + 9 - 1] &= 0x1fffffff; + for (i=8; i>=0; i--) { + r1 = t1[9 + i]; + sp_256_mul_d_9(t2, sd, r1); + (void)sp_256_sub_9(&t1[i], &t1[i], t2); + t1[9 + i] -= t2[9]; + sp_256_norm_9(&t1[i + 1]); + + mask = (sp_digit)0 - ((t1[9 + i] > 0) ? + (sp_digit)1 : (sp_digit)0); + sp_256_cond_sub_9(t1 + i, t1 + i, sd, mask); + sp_256_norm_9(&t1[i + 1]); + } + sp_256_norm_9(t1); + sp_256_rshift_9(r, t1, 5); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_256_mod_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_256_div_9(a, m, NULL, r); } #endif @@ -17835,10 +25217,10 @@ static const sp_int_digit p256_order_low[4] = { * a First operand of the multiplication. * b Second operand of the multiplication. */ -static void sp_256_mont_mul_order_10(sp_digit* r, const sp_digit* a, const sp_digit* b) +static void sp_256_mont_mul_order_9(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_256_mul_10(r, a, b); - sp_256_mont_reduce_order_10(r, p256_order, p256_mp_order); + sp_256_mul_9(r, a, b); + sp_256_mont_reduce_order_9(r, p256_order, p256_mp_order); } /* Square number mod the order of P256 curve. (r = a * a mod order) @@ -17846,10 +25228,10 @@ static void sp_256_mont_mul_order_10(sp_digit* r, const sp_digit* a, const sp_di * r Result of the squaring. * a Number to square. */ -static void sp_256_mont_sqr_order_10(sp_digit* r, const sp_digit* a) +static void sp_256_mont_sqr_order_9(sp_digit* r, const sp_digit* a) { - sp_256_sqr_10(r, a); - sp_256_mont_reduce_order_10(r, p256_order, p256_mp_order); + sp_256_sqr_9(r, a); + sp_256_mont_reduce_order_9(r, p256_order, p256_mp_order); } #ifndef WOLFSSL_SP_SMALL @@ -17859,13 +25241,13 @@ static void sp_256_mont_sqr_order_10(sp_digit* r, const sp_digit* a) * r Result of the squaring. * a Number to square. */ -static void sp_256_mont_sqr_n_order_10(sp_digit* r, const sp_digit* a, int n) +static void sp_256_mont_sqr_n_order_9(sp_digit* r, const sp_digit* a, int n) { int i; - sp_256_mont_sqr_order_10(r, a); + sp_256_mont_sqr_order_9(r, a); for (i=1; i= sizeof(*sp_ctx) ? 
-1 : 1]; + typedef char ctx_size_test[sizeof(sp_256_mont_inv_order_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1]; (void)sizeof(ctx_size_test); switch (ctx->state) { case 0: - XMEMCPY(t, a, sizeof(sp_digit) * 10); + XMEMCPY(t, a, sizeof(sp_digit) * 9); ctx->i = 254; ctx->state = 1; break; case 1: - sp_256_mont_sqr_order_10(t, t); + sp_256_mont_sqr_order_9(t, t); ctx->state = 2; break; case 2: if ((p256_order_minus_2[ctx->i / 32] & ((sp_int_digit)1 << (ctx->i % 32))) != 0) { - sp_256_mont_mul_order_10(t, t, a); + sp_256_mont_mul_order_9(t, t, a); } ctx->i--; ctx->state = (ctx->i == 0) ? 3 : 1; break; case 3: - XMEMCPY(r, t, sizeof(sp_digit) * 10U); + XMEMCPY(r, t, sizeof(sp_digit) * 9U); err = MP_OKAY; break; } @@ -17918,96 +25300,96 @@ static int sp_256_mont_inv_order_10_nb(sp_ecc_ctx_t* sp_ctx, sp_digit* r, const } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_mont_inv_order_10(sp_digit* r, const sp_digit* a, +static void sp_256_mont_inv_order_9(sp_digit* r, const sp_digit* a, sp_digit* td) { #ifdef WOLFSSL_SP_SMALL sp_digit* t = td; int i; - XMEMCPY(t, a, sizeof(sp_digit) * 10); + XMEMCPY(t, a, sizeof(sp_digit) * 9); for (i=254; i>=0; i--) { - sp_256_mont_sqr_order_10(t, t); + sp_256_mont_sqr_order_9(t, t); if ((p256_order_minus_2[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { - sp_256_mont_mul_order_10(t, t, a); + sp_256_mont_mul_order_9(t, t, a); } } - XMEMCPY(r, t, sizeof(sp_digit) * 10U); + XMEMCPY(r, t, sizeof(sp_digit) * 9U); #else sp_digit* t = td; - sp_digit* t2 = td + 2 * 10; - sp_digit* t3 = td + 4 * 10; + sp_digit* t2 = td + 2 * 9; + sp_digit* t3 = td + 4 * 9; int i; /* t = a^2 */ - sp_256_mont_sqr_order_10(t, a); + sp_256_mont_sqr_order_9(t, a); /* t = a^3 = t * a */ - sp_256_mont_mul_order_10(t, t, a); + sp_256_mont_mul_order_9(t, t, a); /* t2= a^c = t ^ 2 ^ 2 */ - sp_256_mont_sqr_n_order_10(t2, t, 2); + sp_256_mont_sqr_n_order_9(t2, t, 2); /* t3= a^f = t2 * t */ - sp_256_mont_mul_order_10(t3, t2, t); + sp_256_mont_mul_order_9(t3, t2, t); /* t2= a^f0 = t3 ^ 2 ^ 4 */ - sp_256_mont_sqr_n_order_10(t2, t3, 4); + sp_256_mont_sqr_n_order_9(t2, t3, 4); /* t = a^ff = t2 * t3 */ - sp_256_mont_mul_order_10(t, t2, t3); + sp_256_mont_mul_order_9(t, t2, t3); /* t3= a^ff00 = t ^ 2 ^ 8 */ - sp_256_mont_sqr_n_order_10(t2, t, 8); + sp_256_mont_sqr_n_order_9(t2, t, 8); /* t = a^ffff = t2 * t */ - sp_256_mont_mul_order_10(t, t2, t); + sp_256_mont_mul_order_9(t, t2, t); /* t2= a^ffff0000 = t ^ 2 ^ 16 */ - sp_256_mont_sqr_n_order_10(t2, t, 16); + sp_256_mont_sqr_n_order_9(t2, t, 16); /* t = a^ffffffff = t2 * t */ - sp_256_mont_mul_order_10(t, t2, t); + sp_256_mont_mul_order_9(t, t2, t); /* t2= a^ffffffff0000000000000000 = t ^ 2 ^ 64 */ - sp_256_mont_sqr_n_order_10(t2, t, 64); + sp_256_mont_sqr_n_order_9(t2, t, 64); /* t2= a^ffffffff00000000ffffffff = t2 * t */ - sp_256_mont_mul_order_10(t2, t2, t); + sp_256_mont_mul_order_9(t2, t2, t); /* t2= a^ffffffff00000000ffffffff00000000 = t2 ^ 2 ^ 32 */ - sp_256_mont_sqr_n_order_10(t2, t2, 32); + sp_256_mont_sqr_n_order_9(t2, t2, 32); /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ - sp_256_mont_mul_order_10(t2, t2, t); + sp_256_mont_mul_order_9(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ for (i=127; i>=112; i--) { - sp_256_mont_sqr_order_10(t2, t2); + sp_256_mont_sqr_order_9(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { - sp_256_mont_mul_order_10(t2, t2, a); + sp_256_mont_mul_order_9(t2, t2, a); } } /* t2= a^ffffffff00000000ffffffffffffffffbce6f */ - sp_256_mont_sqr_n_order_10(t2, t2, 4); - 
sp_256_mont_mul_order_10(t2, t2, t3); + sp_256_mont_sqr_n_order_9(t2, t2, 4); + sp_256_mont_mul_order_9(t2, t2, t3); /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84 */ for (i=107; i>=64; i--) { - sp_256_mont_sqr_order_10(t2, t2); + sp_256_mont_sqr_order_9(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { - sp_256_mont_mul_order_10(t2, t2, a); + sp_256_mont_mul_order_9(t2, t2, a); } } /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f */ - sp_256_mont_sqr_n_order_10(t2, t2, 4); - sp_256_mont_mul_order_10(t2, t2, t3); + sp_256_mont_sqr_n_order_9(t2, t2, 4); + sp_256_mont_mul_order_9(t2, t2, t3); /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2 */ for (i=59; i>=32; i--) { - sp_256_mont_sqr_order_10(t2, t2); + sp_256_mont_sqr_order_9(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { - sp_256_mont_mul_order_10(t2, t2, a); + sp_256_mont_mul_order_9(t2, t2, a); } } /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2f */ - sp_256_mont_sqr_n_order_10(t2, t2, 4); - sp_256_mont_mul_order_10(t2, t2, t3); + sp_256_mont_sqr_n_order_9(t2, t2, 4); + sp_256_mont_mul_order_9(t2, t2, t3); /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254 */ for (i=27; i>=0; i--) { - sp_256_mont_sqr_order_10(t2, t2); + sp_256_mont_sqr_order_9(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { - sp_256_mont_mul_order_10(t2, t2, a); + sp_256_mont_mul_order_9(t2, t2, a); } } /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632540 */ - sp_256_mont_sqr_n_order_10(t2, t2, 4); + sp_256_mont_sqr_n_order_9(t2, t2, 4); /* r = a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f */ - sp_256_mont_mul_order_10(r, t2, t3); + sp_256_mont_mul_order_9(r, t2, t3); #endif /* WOLFSSL_SP_SMALL */ } @@ -18029,41 +25411,41 @@ static void sp_256_mont_inv_order_10(sp_digit* r, const sp_digit* a, * tmp Temporary storage for intermediate numbers. * returns MEMORY_E when memory allocation fails and MP_OKAY on success. 
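sp_256_calc_s_9() below needs 1/k modulo the order; sp_256_mont_inv_order_9() above supplies it via Fermat's little theorem, raising to the power order - 2 (the non-small build replaces the generic bit loop with a fixed chain that reuses a^0x3, a^0xf, a^0xff, ... to save multiplies). The same square-and-multiply idea at toy scale, on plain machine words (helper names are made up):

    #include <stdint.h>

    /* a^-1 mod n for prime n, via a^(n-2) mod n (Fermat). */
    static uint32_t mod_mul(uint32_t a, uint32_t b, uint32_t n)
    {
        return (uint32_t)(((uint64_t)a * b) % n);
    }

    static uint32_t mod_inv_prime(uint32_t a, uint32_t n)
    {
        uint32_t t = 1;
        uint32_t e = n - 2;
        int i;

        for (i = 31; i >= 0; i--) {          /* left-to-right binary exp. */
            t = mod_mul(t, t, n);            /* always square             */
            if ((e >> i) & 1) {
                t = mod_mul(t, a, n);        /* multiply on a set bit     */
            }
        }
        return t;
    }

For instance mod_inv_prime(2, 101) is 51, and 2 * 51 = 102 = 1 (mod 101); the real routine does the same walk over p256_order_minus_2, only with 9-limb Montgomery arithmetic.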
*/ -static int sp_256_calc_s_10(sp_digit* s, const sp_digit* r, sp_digit* k, +static int sp_256_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k, sp_digit* x, const sp_digit* e, sp_digit* tmp) { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ - sp_256_mul_10(k, k, p256_norm_order); - err = sp_256_mod_10(k, k, p256_order); + sp_256_mul_9(k, k, p256_norm_order); + err = sp_256_mod_9(k, k, p256_order); if (err == MP_OKAY) { - sp_256_norm_10(k); + sp_256_norm_9(k); /* kInv = 1/k mod order */ - sp_256_mont_inv_order_10(kInv, k, tmp); - sp_256_norm_10(kInv); + sp_256_mont_inv_order_9(kInv, k, tmp); + sp_256_norm_9(kInv); /* s = r * x + e */ - sp_256_mul_10(x, x, r); - err = sp_256_mod_10(x, x, p256_order); + sp_256_mul_9(x, x, r); + err = sp_256_mod_9(x, x, p256_order); } if (err == MP_OKAY) { - sp_256_norm_10(x); - carry = sp_256_add_10(s, e, x); - sp_256_cond_sub_10(s, s, p256_order, 0 - carry); - sp_256_norm_10(s); - c = sp_256_cmp_10(s, p256_order); - sp_256_cond_sub_10(s, s, p256_order, + sp_256_norm_9(x); + carry = sp_256_add_9(s, e, x); + sp_256_cond_sub_9(s, s, p256_order, 0 - carry); + sp_256_norm_9(s); + c = sp_256_cmp_9(s, p256_order); + sp_256_cond_sub_9(s, s, p256_order, (sp_digit)0 - (sp_digit)(c >= 0)); - sp_256_norm_10(s); + sp_256_norm_9(s); /* s = s * k^-1 mod order */ - sp_256_mont_mul_order_10(s, s, kInv); - sp_256_norm_10(s); + sp_256_mont_mul_order_9(s, s, kInv); + sp_256_norm_9(s); } return err; @@ -18089,14 +25471,14 @@ static int sp_256_calc_s_10(sp_digit* s, const sp_digit* r, sp_digit* k, typedef struct sp_ecc_sign_256_ctx { int state; union { - sp_256_ecc_mulmod_10_ctx mulmod_ctx; - sp_256_mont_inv_order_10_ctx mont_inv_order_ctx; + sp_256_ecc_mulmod_9_ctx mulmod_ctx; + sp_256_mont_inv_order_9_ctx mont_inv_order_ctx; }; - sp_digit e[2*10]; - sp_digit x[2*10]; - sp_digit k[2*10]; - sp_digit r[2*10]; - sp_digit tmp[3 * 2*10]; + sp_digit e[2*9]; + sp_digit x[2*9]; + sp_digit k[2*9]; + sp_digit r[2*9]; + sp_digit tmp[3 * 2*9]; sp_point_256 point; sp_digit* s; sp_digit* kInv; @@ -18128,17 +25510,17 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 1: /* GEN */ /* New random point. 
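sp_ecc_sign_256_nb() is organised as a resumable state machine: all intermediate values live in the caller-supplied ctx, each call performs one step and advances ctx->state, and FP_WOULDBLOCK is returned until the last state finishes. The shape of the pattern with the ECC arithmetic stripped out (names here are made up for illustration):

    #define DEMO_WOULDBLOCK (-1)             /* stand-in for FP_WOULDBLOCK */

    typedef struct {
        int state;
        int acc;
    } demo_ctx;

    /* Sum v[0..n-1] one element per call.  Returns DEMO_WOULDBLOCK while
     * further calls are needed, 0 once ctx->acc holds the result. */
    static int demo_sum_nb(demo_ctx* ctx, const int* v, int n)
    {
        int err = DEMO_WOULDBLOCK;

        if (ctx->state < n) {
            ctx->acc += v[ctx->state];       /* one bounded unit of work */
            ctx->state++;
        }
        else {
            err = 0;
        }
        return err;
    }

Callers zero-initialise the ctx and keep calling until something other than the would-block code comes back, which is how the *_nb entry points are expected to be driven.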
*/ if (km == NULL || mp_iszero(km)) { - err = sp_256_ecc_gen_k_10(rng, ctx->k); + err = sp_256_ecc_gen_k_9(rng, ctx->k); } else { - sp_256_from_mp(ctx->k, 10, km); + sp_256_from_mp(ctx->k, 9, km); mp_zero(km); } XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx)); ctx->state = 2; break; case 2: /* MULMOD */ - err = sp_256_ecc_mulmod_10_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, + err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->point, &p256_base, ctx->k, 1, 1, heap); if (err == MP_OKAY) { ctx->state = 3; @@ -18146,72 +25528,72 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ - XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 10U); - sp_256_norm_10(ctx->r); - c = sp_256_cmp_10(ctx->r, p256_order); - sp_256_cond_sub_10(ctx->r, ctx->r, p256_order, + XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 9U); + sp_256_norm_9(ctx->r); + c = sp_256_cmp_9(ctx->r, p256_order); + sp_256_cond_sub_9(ctx->r, ctx->r, p256_order, (sp_digit)0 - (sp_digit)(c >= 0)); - sp_256_norm_10(ctx->r); + sp_256_norm_9(ctx->r); - sp_256_from_mp(ctx->x, 10, priv); - sp_256_from_bin(ctx->e, 10, hash, (int)hashLen); + sp_256_from_mp(ctx->x, 9, priv); + sp_256_from_bin(ctx->e, 9, hash, (int)hashLen); ctx->state = 4; break; } case 4: /* KMODORDER */ /* Conv k to Montgomery form (mod order) */ - sp_256_mul_10(ctx->k, ctx->k, p256_norm_order); - err = sp_256_mod_10(ctx->k, ctx->k, p256_order); + sp_256_mul_9(ctx->k, ctx->k, p256_norm_order); + err = sp_256_mod_9(ctx->k, ctx->k, p256_order); if (err == MP_OKAY) { - sp_256_norm_10(ctx->k); + sp_256_norm_9(ctx->k); XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx)); ctx->state = 5; } break; case 5: /* KINV */ /* kInv = 1/k mod order */ - err = sp_256_mont_inv_order_10_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->kInv, ctx->k, ctx->tmp); + err = sp_256_mont_inv_order_9_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->kInv, ctx->k, ctx->tmp); if (err == MP_OKAY) { XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx)); ctx->state = 6; } break; case 6: /* KINVNORM */ - sp_256_norm_10(ctx->kInv); + sp_256_norm_9(ctx->kInv); ctx->state = 7; break; case 7: /* R */ /* s = r * x + e */ - sp_256_mul_10(ctx->x, ctx->x, ctx->r); + sp_256_mul_9(ctx->x, ctx->x, ctx->r); ctx->state = 8; break; case 8: /* S1 */ - err = sp_256_mod_10(ctx->x, ctx->x, p256_order); + err = sp_256_mod_9(ctx->x, ctx->x, p256_order); if (err == MP_OKAY) ctx->state = 9; break; case 9: /* S2 */ { sp_digit carry; - int32_t c; - sp_256_norm_10(ctx->x); - carry = sp_256_add_10(ctx->s, ctx->e, ctx->x); - sp_256_cond_sub_10(ctx->s, ctx->s, + sp_int32 c; + sp_256_norm_9(ctx->x); + carry = sp_256_add_9(ctx->s, ctx->e, ctx->x); + sp_256_cond_sub_9(ctx->s, ctx->s, p256_order, 0 - carry); - sp_256_norm_10(ctx->s); - c = sp_256_cmp_10(ctx->s, p256_order); - sp_256_cond_sub_10(ctx->s, ctx->s, p256_order, + sp_256_norm_9(ctx->s); + c = sp_256_cmp_9(ctx->s, p256_order); + sp_256_cond_sub_9(ctx->s, ctx->s, p256_order, (sp_digit)0 - (sp_digit)(c >= 0)); - sp_256_norm_10(ctx->s); + sp_256_norm_9(ctx->s); /* s = s * k^-1 mod order */ - sp_256_mont_mul_order_10(ctx->s, ctx->s, ctx->kInv); - sp_256_norm_10(ctx->s); + sp_256_mont_mul_order_9(ctx->s, ctx->s, ctx->kInv); + sp_256_norm_9(ctx->s); /* Check that signature is usable. 
*/ - if (sp_256_iszero_10(ctx->s) == 0) { + if (sp_256_iszero_9(ctx->s) == 0) { ctx->state = 10; break; } @@ -18239,11 +25621,11 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W err = FP_WOULDBLOCK; } if (err != FP_WOULDBLOCK) { - XMEMSET(ctx->e, 0, sizeof(sp_digit) * 2U * 10U); - XMEMSET(ctx->x, 0, sizeof(sp_digit) * 2U * 10U); - XMEMSET(ctx->k, 0, sizeof(sp_digit) * 2U * 10U); - XMEMSET(ctx->r, 0, sizeof(sp_digit) * 2U * 10U); - XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 3U * 2U * 10U); + XMEMSET(ctx->e, 0, sizeof(sp_digit) * 2U * 9U); + XMEMSET(ctx->x, 0, sizeof(sp_digit) * 2U * 9U); + XMEMSET(ctx->k, 0, sizeof(sp_digit) * 2U * 9U); + XMEMSET(ctx->r, 0, sizeof(sp_digit) * 2U * 9U); + XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 3U * 2U * 9U); } return err; @@ -18257,7 +25639,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* e = NULL; sp_point_256* point = NULL; #else - sp_digit e[7 * 2 * 10]; + sp_digit e[7 * 2 * 9]; sp_point_256 point[1]; #endif sp_digit* x = NULL; @@ -18265,7 +25647,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -18279,7 +25661,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, err = MEMORY_E; } if (err == MP_OKAY) { - e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 7 * 2 * 10, heap, + e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 7 * 2 * 9, heap, DYNAMIC_TYPE_ECC); if (e == NULL) err = MEMORY_E; @@ -18287,10 +25669,10 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, #endif if (err == MP_OKAY) { - x = e + 2 * 10; - k = e + 4 * 10; - r = e + 6 * 10; - tmp = e + 8 * 10; + x = e + 2 * 9; + k = e + 4 * 9; + r = e + 6 * 9; + tmp = e + 8 * 9; s = e; if (hashLen > 32U) { @@ -18301,33 +25683,33 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, for (i = SP_ECC_MAX_SIG_GEN; err == MP_OKAY && i > 0; i--) { /* New random point. */ if (km == NULL || mp_iszero(km)) { - err = sp_256_ecc_gen_k_10(rng, k); + err = sp_256_ecc_gen_k_9(rng, k); } else { - sp_256_from_mp(k, 10, km); + sp_256_from_mp(k, 9, km); mp_zero(km); } if (err == MP_OKAY) { - err = sp_256_ecc_mulmod_base_10(point, k, 1, 1, heap); + err = sp_256_ecc_mulmod_base_9(point, k, 1, 1, heap); } if (err == MP_OKAY) { /* r = point->x mod order */ - XMEMCPY(r, point->x, sizeof(sp_digit) * 10U); - sp_256_norm_10(r); - c = sp_256_cmp_10(r, p256_order); - sp_256_cond_sub_10(r, r, p256_order, + XMEMCPY(r, point->x, sizeof(sp_digit) * 9U); + sp_256_norm_9(r); + c = sp_256_cmp_9(r, p256_order); + sp_256_cond_sub_9(r, r, p256_order, (sp_digit)0 - (sp_digit)(c >= 0)); - sp_256_norm_10(r); + sp_256_norm_9(r); - sp_256_from_mp(x, 10, priv); - sp_256_from_bin(e, 10, hash, (int)hashLen); + sp_256_from_mp(x, 9, priv); + sp_256_from_bin(e, 9, hash, (int)hashLen); - err = sp_256_calc_s_10(s, r, k, x, e, tmp); + err = sp_256_calc_s_9(s, r, k, x, e, tmp); } /* Check that signature is usable. 
*/ - if ((err == MP_OKAY) && (sp_256_iszero_10(s) == 0)) { + if ((err == MP_OKAY) && (sp_256_iszero_9(s) == 0)) { break; } #ifdef WOLFSSL_ECDSA_SET_K_ONE_LOOP @@ -18350,7 +25732,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, if (e != NULL) #endif { - ForceZero(e, sizeof(sp_digit) * 7 * 2 * 10); + ForceZero(e, sizeof(sp_digit) * 7 * 2 * 9); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(e, heap, DYNAMIC_TYPE_ECC); #endif @@ -18370,31 +25752,31 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, #endif /* HAVE_ECC_SIGN */ #ifndef WOLFSSL_SP_SMALL -static const char sp_256_tab32_10[32] = { +static const char sp_256_tab32_9[32] = { 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32}; -static int sp_256_num_bits_26_10(sp_digit v) +static int sp_256_num_bits_29_9(sp_digit v) { v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; - return sp_256_tab32_10[(uint32_t)(v*0x07C4ACDD) >> 27]; + return sp_256_tab32_9[(uint32_t)(v*0x07C4ACDD) >> 27]; } -static int sp_256_num_bits_10(const sp_digit* a) +static int sp_256_num_bits_9(const sp_digit* a) { int i; int r = 0; - for (i = 9; i >= 0; i--) { + for (i = 8; i >= 0; i--) { if (a[i] != 0) { - r = sp_256_num_bits_26_10(a[i]); - r += i * 26; + r = sp_256_num_bits_29_9(a[i]); + r += i * 29; break; } } @@ -18410,13 +25792,13 @@ static int sp_256_num_bits_10(const sp_digit* a) * @return MP_OKAY on success. * @return MEMEORY_E when dynamic memory allocation fails. */ -static int sp_256_mod_inv_10(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_256_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m) { int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* u = NULL; #else - sp_digit u[10 * 4]; + sp_digit u[9 * 4]; #endif sp_digit* v = NULL; sp_digit* b = NULL; @@ -18425,82 +25807,82 @@ static int sp_256_mod_inv_10(sp_digit* r, const sp_digit* a, const sp_digit* m) int vt; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - u = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10 * 4, NULL, + u = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9 * 4, NULL, DYNAMIC_TYPE_ECC); if (u == NULL) err = MEMORY_E; #endif if (err == MP_OKAY) { - v = u + 10; - b = u + 2 * 10; - d = u + 3 * 10; + v = u + 9; + b = u + 2 * 9; + d = u + 3 * 9; - XMEMCPY(u, m, sizeof(sp_digit) * 10); - XMEMCPY(v, a, sizeof(sp_digit) * 10); + XMEMCPY(u, m, sizeof(sp_digit) * 9); + XMEMCPY(v, a, sizeof(sp_digit) * 9); - ut = sp_256_num_bits_10(u); - vt = sp_256_num_bits_10(v); + ut = sp_256_num_bits_9(u); + vt = sp_256_num_bits_9(v); - XMEMSET(b, 0, sizeof(sp_digit) * 10); + XMEMSET(b, 0, sizeof(sp_digit) * 9); if ((v[0] & 1) == 0) { - sp_256_rshift1_10(v, v); - XMEMCPY(d, m, sizeof(sp_digit) * 10); + sp_256_rshift1_9(v, v); + XMEMCPY(d, m, sizeof(sp_digit) * 9); d[0]++; - sp_256_rshift1_10(d, d); + sp_256_rshift1_9(d, d); vt--; while ((v[0] & 1) == 0) { - sp_256_rshift1_10(v, v); + sp_256_rshift1_9(v, v); if (d[0] & 1) - sp_256_add_10(d, d, m); - sp_256_rshift1_10(d, d); + sp_256_add_9(d, d, m); + sp_256_rshift1_9(d, d); vt--; } } else { - XMEMSET(d+1, 0, sizeof(sp_digit) * (10 - 1)); + XMEMSET(d+1, 0, sizeof(sp_digit) * (9 - 1)); d[0] = 1; } while (ut > 1 && vt > 1) { if (ut > vt || (ut == vt && - sp_256_cmp_10(u, v) >= 0)) { - sp_256_sub_10(u, u, v); - sp_256_norm_10(u); + sp_256_cmp_9(u, v) >= 0)) { + sp_256_sub_9(u, u, v); + sp_256_norm_9(u); - sp_256_sub_10(b, b, d); - 
sp_256_norm_10(b); - if (b[9] < 0) - sp_256_add_10(b, b, m); - sp_256_norm_10(b); - ut = sp_256_num_bits_10(u); + sp_256_sub_9(b, b, d); + sp_256_norm_9(b); + if (b[8] < 0) + sp_256_add_9(b, b, m); + sp_256_norm_9(b); + ut = sp_256_num_bits_9(u); do { - sp_256_rshift1_10(u, u); + sp_256_rshift1_9(u, u); if (b[0] & 1) - sp_256_add_10(b, b, m); - sp_256_rshift1_10(b, b); + sp_256_add_9(b, b, m); + sp_256_rshift1_9(b, b); ut--; } while (ut > 0 && (u[0] & 1) == 0); } else { - sp_256_sub_10(v, v, u); - sp_256_norm_10(v); + sp_256_sub_9(v, v, u); + sp_256_norm_9(v); - sp_256_sub_10(d, d, b); - sp_256_norm_10(d); - if (d[9] < 0) - sp_256_add_10(d, d, m); - sp_256_norm_10(d); - vt = sp_256_num_bits_10(v); + sp_256_sub_9(d, d, b); + sp_256_norm_9(d); + if (d[8] < 0) + sp_256_add_9(d, d, m); + sp_256_norm_9(d); + vt = sp_256_num_bits_9(v); do { - sp_256_rshift1_10(v, v); + sp_256_rshift1_9(v, v); if (d[0] & 1) - sp_256_add_10(d, d, m); - sp_256_rshift1_10(d, d); + sp_256_add_9(d, d, m); + sp_256_rshift1_9(d, d); vt--; } while (vt > 0 && (v[0] & 1) == 0); @@ -18508,9 +25890,9 @@ static int sp_256_mod_inv_10(sp_digit* r, const sp_digit* a, const sp_digit* m) } if (ut == 1) - XMEMCPY(r, b, sizeof(sp_digit) * 10); + XMEMCPY(r, b, sizeof(sp_digit) * 9); else - XMEMCPY(r, d, sizeof(sp_digit) * 10); + XMEMCPY(r, d, sizeof(sp_digit) * 9); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (u != NULL) @@ -18528,14 +25910,14 @@ static int sp_256_mod_inv_10(sp_digit* r, const sp_digit* a, const sp_digit* m) * p2 Second point to add. * tmp Temporary storage for intermediate numbers. */ -static void sp_256_add_points_10(sp_point_256* p1, const sp_point_256* p2, +static void sp_256_add_points_9(sp_point_256* p1, const sp_point_256* p2, sp_digit* tmp) { - sp_256_proj_point_add_10(p1, p1, p2, tmp); - if (sp_256_iszero_10(p1->z)) { - if (sp_256_iszero_10(p1->x) && sp_256_iszero_10(p1->y)) { - sp_256_proj_point_dbl_10(p1, p2, tmp); + sp_256_proj_point_add_9(p1, p1, p2, tmp); + if (sp_256_iszero_9(p1->z)) { + if (sp_256_iszero_9(p1->x) && sp_256_iszero_9(p1->y)) { + sp_256_proj_point_dbl_9(p1, p2, tmp); } else { /* Y ordinate is not used from here - don't set. */ @@ -18548,7 +25930,6 @@ static void sp_256_add_points_10(sp_point_256* p1, const sp_point_256* p2, p1->x[6] = 0; p1->x[7] = 0; p1->x[8] = 0; - p1->x[9] = 0; XMEMCPY(p1->z, p256_norm_mod, sizeof(p256_norm_mod)); } } @@ -18564,50 +25945,50 @@ static void sp_256_add_points_10(sp_point_256* p1, const sp_point_256* p2, * heap Heap to use for allocation. * returns MEMORY_E when memory allocation fails and MP_OKAY on success. 
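sp_256_calc_vfy_point_9() below does the shared half of ECDSA verification: it forms u1 = e/s and u2 = r/s modulo the order and returns p1 = u1.G + u2.Q. Why the x ordinate of that sum must reproduce r is a one-line calculation; sketched here in the notation of the signing code above (e = hash, x = private key, Q = x.G, k the ephemeral scalar):

    /*
     * s  = k^-1 * (e + r*x)          (mod order)   -- how the signer built s
     * u1 = e * s^-1,  u2 = r * s^-1  (mod order)   -- computed below
     * u1 + u2*x = s^-1 * (e + r*x) = k             (mod order)
     * => u1*G + u2*Q = (u1 + u2*x)*G = k*G,
     *    and r was taken from the x ordinate of k*G reduced mod the order.
     */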
*/ -static int sp_256_calc_vfy_point_10(sp_point_256* p1, sp_point_256* p2, +static int sp_256_calc_vfy_point_9(sp_point_256* p1, sp_point_256* p2, sp_digit* s, sp_digit* u1, sp_digit* u2, sp_digit* tmp, void* heap) { int err; #ifndef WOLFSSL_SP_SMALL { - sp_256_mod_inv_10(s, s, p256_order); + sp_256_mod_inv_9(s, s, p256_order); } #endif /* !WOLFSSL_SP_SMALL */ { - sp_256_mul_10(s, s, p256_norm_order); + sp_256_mul_9(s, s, p256_norm_order); } - err = sp_256_mod_10(s, s, p256_order); + err = sp_256_mod_9(s, s, p256_order); if (err == MP_OKAY) { - sp_256_norm_10(s); + sp_256_norm_9(s); #ifdef WOLFSSL_SP_SMALL { - sp_256_mont_inv_order_10(s, s, tmp); - sp_256_mont_mul_order_10(u1, u1, s); - sp_256_mont_mul_order_10(u2, u2, s); + sp_256_mont_inv_order_9(s, s, tmp); + sp_256_mont_mul_order_9(u1, u1, s); + sp_256_mont_mul_order_9(u2, u2, s); } #else { - sp_256_mont_mul_order_10(u1, u1, s); - sp_256_mont_mul_order_10(u2, u2, s); + sp_256_mont_mul_order_9(u1, u1, s); + sp_256_mont_mul_order_9(u2, u2, s); } #endif /* WOLFSSL_SP_SMALL */ - err = sp_256_ecc_mulmod_base_10(p1, u1, 0, 0, heap); + err = sp_256_ecc_mulmod_base_9(p1, u1, 0, 0, heap); } - if ((err == MP_OKAY) && sp_256_iszero_10(p1->z)) { + if ((err == MP_OKAY) && sp_256_iszero_9(p1->z)) { p1->infinity = 1; } if (err == MP_OKAY) { - err = sp_256_ecc_mulmod_10(p2, p2, u2, 0, 0, heap); + err = sp_256_ecc_mulmod_9(p2, p2, u2, 0, 0, heap); } - if ((err == MP_OKAY) && sp_256_iszero_10(p2->z)) { + if ((err == MP_OKAY) && sp_256_iszero_9(p2->z)) { p2->infinity = 1; } if (err == MP_OKAY) { - sp_256_add_points_10(p1, p2, tmp); + sp_256_add_points_9(p1, p2, tmp); } return err; @@ -18637,15 +26018,15 @@ static int sp_256_calc_vfy_point_10(sp_point_256* p1, sp_point_256* p2, typedef struct sp_ecc_verify_256_ctx { int state; union { - sp_256_ecc_mulmod_10_ctx mulmod_ctx; - sp_256_mont_inv_order_10_ctx mont_inv_order_ctx; - sp_256_proj_point_dbl_10_ctx dbl_ctx; - sp_256_proj_point_add_10_ctx add_ctx; + sp_256_ecc_mulmod_9_ctx mulmod_ctx; + sp_256_mont_inv_order_9_ctx mont_inv_order_ctx; + sp_256_proj_point_dbl_9_ctx dbl_ctx; + sp_256_proj_point_add_9_ctx add_ctx; }; - sp_digit u1[2*10]; - sp_digit u2[2*10]; - sp_digit s[2*10]; - sp_digit tmp[2*10 * 5]; + sp_digit u1[2*9]; + sp_digit u2[2*9]; + sp_digit s[2*9]; + sp_digit tmp[2*9 * 5]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -18666,44 +26047,44 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, hashLen = 32U; } - sp_256_from_bin(ctx->u1, 10, hash, (int)hashLen); - sp_256_from_mp(ctx->u2, 10, rm); - sp_256_from_mp(ctx->s, 10, sm); - sp_256_from_mp(ctx->p2.x, 10, pX); - sp_256_from_mp(ctx->p2.y, 10, pY); - sp_256_from_mp(ctx->p2.z, 10, pZ); + sp_256_from_bin(ctx->u1, 9, hash, (int)hashLen); + sp_256_from_mp(ctx->u2, 9, rm); + sp_256_from_mp(ctx->s, 9, sm); + sp_256_from_mp(ctx->p2.x, 9, pX); + sp_256_from_mp(ctx->p2.y, 9, pY); + sp_256_from_mp(ctx->p2.z, 9, pZ); ctx->state = 1; break; case 1: /* NORMS0 */ - sp_256_mul_10(ctx->s, ctx->s, p256_norm_order); - err = sp_256_mod_10(ctx->s, ctx->s, p256_order); + sp_256_mul_9(ctx->s, ctx->s, p256_norm_order); + err = sp_256_mod_9(ctx->s, ctx->s, p256_order); if (err == MP_OKAY) ctx->state = 2; break; case 2: /* NORMS1 */ - sp_256_norm_10(ctx->s); + sp_256_norm_9(ctx->s); XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx)); ctx->state = 3; break; case 3: /* NORMS2 */ - err = sp_256_mont_inv_order_10_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->s, ctx->s, ctx->tmp); + err = 
sp_256_mont_inv_order_9_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->s, ctx->s, ctx->tmp); if (err == MP_OKAY) { ctx->state = 4; } break; case 4: /* NORMS3 */ - sp_256_mont_mul_order_10(ctx->u1, ctx->u1, ctx->s); + sp_256_mont_mul_order_9(ctx->u1, ctx->u1, ctx->s); ctx->state = 5; break; case 5: /* NORMS4 */ - sp_256_mont_mul_order_10(ctx->u2, ctx->u2, ctx->s); + sp_256_mont_mul_order_9(ctx->u2, ctx->u2, ctx->s); XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx)); ctx->state = 6; break; case 6: /* MULBASE */ - err = sp_256_ecc_mulmod_10_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p1, &p256_base, ctx->u1, 0, 0, heap); + err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p1, &p256_base, ctx->u1, 0, 0, heap); if (err == MP_OKAY) { - if (sp_256_iszero_10(ctx->p1.z)) { + if (sp_256_iszero_9(ctx->p1.z)) { ctx->p1.infinity = 1; } XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx)); @@ -18711,9 +26092,9 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, } break; case 7: /* MULMOD */ - err = sp_256_ecc_mulmod_10_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p2, &ctx->p2, ctx->u2, 0, 0, heap); + err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p2, &ctx->p2, ctx->u2, 0, 0, heap); if (err == MP_OKAY) { - if (sp_256_iszero_10(ctx->p2.z)) { + if (sp_256_iszero_9(ctx->p2.z)) { ctx->p2.infinity = 1; } XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx)); @@ -18721,54 +26102,54 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, } break; case 8: /* ADD */ - err = sp_256_proj_point_add_10_nb((sp_ecc_ctx_t*)&ctx->add_ctx, &ctx->p1, &ctx->p1, &ctx->p2, ctx->tmp); + err = sp_256_proj_point_add_9_nb((sp_ecc_ctx_t*)&ctx->add_ctx, &ctx->p1, &ctx->p1, &ctx->p2, ctx->tmp); if (err == MP_OKAY) ctx->state = 9; break; case 9: /* MONT */ /* (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x' */ /* Reload r and convert to Montgomery form. */ - sp_256_from_mp(ctx->u2, 10, rm); - err = sp_256_mod_mul_norm_10(ctx->u2, ctx->u2, p256_mod); + sp_256_from_mp(ctx->u2, 9, rm); + err = sp_256_mod_mul_norm_9(ctx->u2, ctx->u2, p256_mod); if (err == MP_OKAY) ctx->state = 10; break; case 10: /* SQR */ /* u1 = r.z'.z' mod prime */ - sp_256_mont_sqr_10(ctx->p1.z, ctx->p1.z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(ctx->p1.z, ctx->p1.z, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: /* MUL */ - sp_256_mont_mul_10(ctx->u1, ctx->u2, ctx->p1.z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->u1, ctx->u2, ctx->p1.z, p256_mod, p256_mp_mod); ctx->state = 12; break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ - *res = (int)(sp_256_cmp_10(ctx->p1.x, ctx->u1) == 0); + *res = (int)(sp_256_cmp_9(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { sp_digit carry; /* Reload r and add order. */ - sp_256_from_mp(ctx->u2, 10, rm); - carry = sp_256_add_10(ctx->u2, ctx->u2, p256_order); + sp_256_from_mp(ctx->u2, 9, rm); + carry = sp_256_add_9(ctx->u2, ctx->u2, p256_order); /* Carry means result is greater than mod and is not valid. */ if (carry == 0) { - sp_256_norm_10(ctx->u2); + sp_256_norm_9(ctx->u2); /* Compare with mod and if greater or equal then not valid. 
*/ - c = sp_256_cmp_10(ctx->u2, p256_mod); + c = sp_256_cmp_9(ctx->u2, p256_mod); } } if ((*res == 0) && (c < 0)) { /* Convert to Montogomery form */ - err = sp_256_mod_mul_norm_10(ctx->u2, ctx->u2, p256_mod); + err = sp_256_mod_mul_norm_9(ctx->u2, ctx->u2, p256_mod); if (err == MP_OKAY) { /* u1 = (r + 1*order).z'.z' mod prime */ - sp_256_mont_mul_10(ctx->u1, ctx->u2, ctx->p1.z, p256_mod, + sp_256_mont_mul_9(ctx->u1, ctx->u2, ctx->p1.z, p256_mod, p256_mp_mod); - *res = (int)(sp_256_cmp_10(ctx->p1.x, ctx->u1) == 0); + *res = (int)(sp_256_cmp_9(ctx->p1.x, ctx->u1) == 0); } } break; @@ -18791,7 +26172,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 10]; + sp_digit u1[16 * 9]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -18799,7 +26180,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -18810,7 +26191,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 10, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 9, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -18818,56 +26199,56 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, #endif if (err == MP_OKAY) { - u2 = u1 + 2 * 10; - s = u1 + 4 * 10; - tmp = u1 + 6 * 10; + u2 = u1 + 2 * 9; + s = u1 + 4 * 9; + tmp = u1 + 6 * 9; p2 = p1 + 1; if (hashLen > 32U) { hashLen = 32U; } - sp_256_from_bin(u1, 10, hash, (int)hashLen); - sp_256_from_mp(u2, 10, rm); - sp_256_from_mp(s, 10, sm); - sp_256_from_mp(p2->x, 10, pX); - sp_256_from_mp(p2->y, 10, pY); - sp_256_from_mp(p2->z, 10, pZ); + sp_256_from_bin(u1, 9, hash, (int)hashLen); + sp_256_from_mp(u2, 9, rm); + sp_256_from_mp(s, 9, sm); + sp_256_from_mp(p2->x, 9, pX); + sp_256_from_mp(p2->y, 9, pY); + sp_256_from_mp(p2->z, 9, pZ); - err = sp_256_calc_vfy_point_10(p1, p2, s, u1, u2, tmp, heap); + err = sp_256_calc_vfy_point_9(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { /* (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x' */ /* Reload r and convert to Montgomery form. */ - sp_256_from_mp(u2, 10, rm); - err = sp_256_mod_mul_norm_10(u2, u2, p256_mod); + sp_256_from_mp(u2, 9, rm); + err = sp_256_mod_mul_norm_9(u2, u2, p256_mod); } if (err == MP_OKAY) { /* u1 = r.z'.z' mod prime */ - sp_256_mont_sqr_10(p1->z, p1->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_10(u1, u2, p1->z, p256_mod, p256_mp_mod); - *res = (int)(sp_256_cmp_10(p1->x, u1) == 0); + sp_256_mont_sqr_9(p1->z, p1->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(u1, u2, p1->z, p256_mod, p256_mp_mod); + *res = (int)(sp_256_cmp_9(p1->x, u1) == 0); if (*res == 0) { /* Reload r and add order. */ - sp_256_from_mp(u2, 10, rm); - carry = sp_256_add_10(u2, u2, p256_order); + sp_256_from_mp(u2, 9, rm); + carry = sp_256_add_9(u2, u2, p256_order); /* Carry means result is greater than mod and is not valid. */ if (carry == 0) { - sp_256_norm_10(u2); + sp_256_norm_9(u2); /* Compare with mod and if greater or equal then not valid. 
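One subtlety in the comparison above: r was reduced modulo the group order n, while the x ordinate of u1.G + u2.Q is only available modulo the prime p. A short note on why the code also tries r + n:

    /*
     * Since n < p < 2n, a coordinate value v in [0, p) satisfies
     * v mod n == r exactly when v == r or v == r + n (the latter only
     * possible if r + n < p).  The first compare handles v == r; the add
     * of p256_order, the carry test and the compare against p256_mod
     * handle the rare v == r + n case, rejecting it when r + n is not a
     * valid coordinate.
     */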
*/ - c = sp_256_cmp_10(u2, p256_mod); + c = sp_256_cmp_9(u2, p256_mod); } } if ((*res == 0) && (c < 0)) { /* Convert to Montogomery form */ - err = sp_256_mod_mul_norm_10(u2, u2, p256_mod); + err = sp_256_mod_mul_norm_9(u2, u2, p256_mod); if (err == MP_OKAY) { /* u1 = (r + 1*order).z'.z' mod prime */ - sp_256_mont_mul_10(u1, u2, p1->z, p256_mod, + sp_256_mont_mul_9(u1, u2, p1->z, p256_mod, p256_mp_mod); - *res = (sp_256_cmp_10(p1->x, u1) == 0); + *res = (sp_256_cmp_9(p1->x, u1) == 0); } } } @@ -18891,41 +26272,41 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, * returns MEMORY_E if dynamic memory allocation fails, MP_VAL if the point is * not on the curve and MP_OKAY otherwise. */ -static int sp_256_ecc_is_point_10(const sp_point_256* point, +static int sp_256_ecc_is_point_9(const sp_point_256* point, void* heap) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[10 * 4]; + sp_digit t1[9 * 4]; #endif sp_digit* t2 = NULL; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10 * 4, heap, DYNAMIC_TYPE_ECC); + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9 * 4, heap, DYNAMIC_TYPE_ECC); if (t1 == NULL) err = MEMORY_E; #endif (void)heap; if (err == MP_OKAY) { - t2 = t1 + 2 * 10; + t2 = t1 + 2 * 9; - sp_256_sqr_10(t1, point->y); - (void)sp_256_mod_10(t1, t1, p256_mod); - sp_256_sqr_10(t2, point->x); - (void)sp_256_mod_10(t2, t2, p256_mod); - sp_256_mul_10(t2, t2, point->x); - (void)sp_256_mod_10(t2, t2, p256_mod); - (void)sp_256_sub_10(t2, p256_mod, t2); - sp_256_mont_add_10(t1, t1, t2, p256_mod); + sp_256_sqr_9(t1, point->y); + (void)sp_256_mod_9(t1, t1, p256_mod); + sp_256_sqr_9(t2, point->x); + (void)sp_256_mod_9(t2, t2, p256_mod); + sp_256_mul_9(t2, t2, point->x); + (void)sp_256_mod_9(t2, t2, p256_mod); + (void)sp_256_sub_9(t2, p256_mod, t2); + sp_256_mont_add_9(t1, t1, t2, p256_mod); - sp_256_mont_add_10(t1, t1, point->x, p256_mod); - sp_256_mont_add_10(t1, t1, point->x, p256_mod); - sp_256_mont_add_10(t1, t1, point->x, p256_mod); + sp_256_mont_add_9(t1, t1, point->x, p256_mod); + sp_256_mont_add_9(t1, t1, point->x, p256_mod); + sp_256_mont_add_9(t1, t1, point->x, p256_mod); - if (sp_256_cmp_10(t1, p256_b) != 0) { + if (sp_256_cmp_9(t1, p256_b) != 0) { err = MP_VAL; } } @@ -18963,11 +26344,11 @@ int sp_ecc_is_point_256(const mp_int* pX, const mp_int* pY) #endif if (err == MP_OKAY) { - sp_256_from_mp(pub->x, 10, pX); - sp_256_from_mp(pub->y, 10, pY); - sp_256_from_bin(pub->z, 10, one, (int)sizeof(one)); + sp_256_from_mp(pub->x, 9, pX); + sp_256_from_mp(pub->y, 9, pY); + sp_256_from_bin(pub->z, 9, one, (int)sizeof(one)); - err = sp_256_ecc_is_point_10(pub, NULL); + err = sp_256_ecc_is_point_9(pub, NULL); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -18996,7 +26377,7 @@ int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY, sp_digit* priv = NULL; sp_point_256* pub = NULL; #else - sp_digit priv[10]; + sp_digit priv[9]; sp_point_256 pub[2]; #endif sp_point_256* p = NULL; @@ -19021,7 +26402,7 @@ int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY, err = MEMORY_E; } if (err == MP_OKAY && privm) { - priv = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10, heap, + priv = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap, DYNAMIC_TYPE_ECC); if (priv == NULL) err = MEMORY_E; @@ -19031,50 +26412,50 @@ int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY, if (err == MP_OKAY) { p = pub + 1; - 
sp_256_from_mp(pub->x, 10, pX); - sp_256_from_mp(pub->y, 10, pY); - sp_256_from_bin(pub->z, 10, one, (int)sizeof(one)); + sp_256_from_mp(pub->x, 9, pX); + sp_256_from_mp(pub->y, 9, pY); + sp_256_from_bin(pub->z, 9, one, (int)sizeof(one)); if (privm) - sp_256_from_mp(priv, 10, privm); + sp_256_from_mp(priv, 9, privm); /* Check point at infinitiy. */ - if ((sp_256_iszero_10(pub->x) != 0) && - (sp_256_iszero_10(pub->y) != 0)) { + if ((sp_256_iszero_9(pub->x) != 0) && + (sp_256_iszero_9(pub->y) != 0)) { err = ECC_INF_E; } } /* Check range of X and Y */ if ((err == MP_OKAY) && - ((sp_256_cmp_10(pub->x, p256_mod) >= 0) || - (sp_256_cmp_10(pub->y, p256_mod) >= 0))) { + ((sp_256_cmp_9(pub->x, p256_mod) >= 0) || + (sp_256_cmp_9(pub->y, p256_mod) >= 0))) { err = ECC_OUT_OF_RANGE_E; } if (err == MP_OKAY) { /* Check point is on curve */ - err = sp_256_ecc_is_point_10(pub, heap); + err = sp_256_ecc_is_point_9(pub, heap); } if (err == MP_OKAY) { /* Point * order = infinity */ - err = sp_256_ecc_mulmod_10(p, pub, p256_order, 1, 1, heap); + err = sp_256_ecc_mulmod_9(p, pub, p256_order, 1, 1, heap); } /* Check result is infinity */ - if ((err == MP_OKAY) && ((sp_256_iszero_10(p->x) == 0) || - (sp_256_iszero_10(p->y) == 0))) { + if ((err == MP_OKAY) && ((sp_256_iszero_9(p->x) == 0) || + (sp_256_iszero_9(p->y) == 0))) { err = ECC_INF_E; } if (privm) { if (err == MP_OKAY) { /* Base * private = point */ - err = sp_256_ecc_mulmod_base_10(p, priv, 1, 1, heap); + err = sp_256_ecc_mulmod_base_9(p, priv, 1, 1, heap); } /* Check result is public key */ if ((err == MP_OKAY) && - ((sp_256_cmp_10(p->x, pub->x) != 0) || - (sp_256_cmp_10(p->y, pub->y) != 0))) { + ((sp_256_cmp_9(p->x, pub->x) != 0) || + (sp_256_cmp_9(p->y, pub->y) != 0))) { err = ECC_PRIV_KEY_E; } } @@ -19112,7 +26493,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 10 * 5]; + sp_digit tmp[2 * 9 * 5]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -19126,7 +26507,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -19137,18 +26518,18 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, if (err == MP_OKAY) { q = p + 1; - sp_256_from_mp(p->x, 10, pX); - sp_256_from_mp(p->y, 10, pY); - sp_256_from_mp(p->z, 10, pZ); - sp_256_from_mp(q->x, 10, qX); - sp_256_from_mp(q->y, 10, qY); - sp_256_from_mp(q->z, 10, qZ); - p->infinity = sp_256_iszero_10(p->x) & - sp_256_iszero_10(p->y); - q->infinity = sp_256_iszero_10(q->x) & - sp_256_iszero_10(q->y); + sp_256_from_mp(p->x, 9, pX); + sp_256_from_mp(p->y, 9, pY); + sp_256_from_mp(p->z, 9, pZ); + sp_256_from_mp(q->x, 9, qX); + sp_256_from_mp(q->y, 9, qY); + sp_256_from_mp(q->z, 9, qZ); + p->infinity = sp_256_iszero_9(p->x) & + sp_256_iszero_9(p->y); + q->infinity = sp_256_iszero_9(q->x) & + sp_256_iszero_9(q->y); - sp_256_proj_point_add_10(p, p, q, tmp); + sp_256_proj_point_add_9(p, p, q, tmp); } if (err == MP_OKAY) { @@ -19189,7 +26570,7 @@ int sp_ecc_proj_dbl_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 10 * 2]; + sp_digit tmp[2 * 9 * 2]; sp_point_256 p[1]; #endif int err = MP_OKAY; @@ -19202,7 +26583,7 @@ int sp_ecc_proj_dbl_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err 
== MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 2, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 2, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -19210,13 +26591,13 @@ int sp_ecc_proj_dbl_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, #endif if (err == MP_OKAY) { - sp_256_from_mp(p->x, 10, pX); - sp_256_from_mp(p->y, 10, pY); - sp_256_from_mp(p->z, 10, pZ); - p->infinity = sp_256_iszero_10(p->x) & - sp_256_iszero_10(p->y); + sp_256_from_mp(p->x, 9, pX); + sp_256_from_mp(p->y, 9, pY); + sp_256_from_mp(p->z, 9, pZ); + p->infinity = sp_256_iszero_9(p->x) & + sp_256_iszero_9(p->y); - sp_256_proj_point_dbl_10(p, p, tmp); + sp_256_proj_point_dbl_9(p, p, tmp); } if (err == MP_OKAY) { @@ -19253,7 +26634,7 @@ int sp_ecc_map_256(mp_int* pX, mp_int* pY, mp_int* pZ) sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 10 * 4]; + sp_digit tmp[2 * 9 * 4]; sp_point_256 p[1]; #endif int err = MP_OKAY; @@ -19267,20 +26648,20 @@ int sp_ecc_map_256(mp_int* pX, mp_int* pY, mp_int* pZ) err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 10 * 4, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 4, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - sp_256_from_mp(p->x, 10, pX); - sp_256_from_mp(p->y, 10, pY); - sp_256_from_mp(p->z, 10, pZ); - p->infinity = sp_256_iszero_10(p->x) & - sp_256_iszero_10(p->y); + sp_256_from_mp(p->x, 9, pX); + sp_256_from_mp(p->y, 9, pY); + sp_256_from_mp(p->z, 9, pZ); + p->infinity = sp_256_iszero_9(p->x) & + sp_256_iszero_9(p->y); - sp_256_map_10(p, p, tmp); + sp_256_map_9(p, p, tmp); } if (err == MP_OKAY) { @@ -19309,56 +26690,56 @@ int sp_ecc_map_256(mp_int* pX, mp_int* pY, mp_int* pZ) * y The number to operate on and the result. * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise. 
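sp_256_mont_sqrt_9() below uses the standard shortcut for primes with p = 3 (mod 4): a square root of y is y^((p+1)/4). For the P-256 prime, (p+1)/4 = 2^254 - 2^222 + 2^190 + 2^94, and the fixed sqr/mul chain in the body is that exponentiation with the bit pattern hard-wired (the final sqr_n by 94 contributes the trailing 2^94 factor). The same idea at toy scale (hypothetical helpers):

    #include <stdint.h>

    /* Square root modulo a prime p with p % 4 == 3: if a is a quadratic
     * residue then t = a^((p+1)/4) satisfies t*t == a (mod p). */
    static uint32_t mod_mul(uint32_t a, uint32_t b, uint32_t p)
    {
        return (uint32_t)(((uint64_t)a * b) % p);
    }

    static uint32_t mod_sqrt_3mod4(uint32_t a, uint32_t p)
    {
        uint32_t e = p / 4 + 1;              /* == (p+1)/4 since p % 4 == 3 */
        uint32_t t = 1;
        int i;

        for (i = 31; i >= 0; i--) {          /* square and multiply */
            t = mod_mul(t, t, p);
            if ((e >> i) & 1) {
                t = mod_mul(t, a, p);
            }
        }
        return t;                            /* caller verifies t*t == a */
    }

For example mod_sqrt_3mod4(2, 7) returns 4, and 4 * 4 = 16 = 2 (mod 7). sp_ecc_uncompress_256() afterwards flips the root's sign when its parity does not match the requested odd bit.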
*/ -static int sp_256_mont_sqrt_10(sp_digit* y) +static int sp_256_mont_sqrt_9(sp_digit* y) { #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[4 * 10]; + sp_digit t1[4 * 9]; #endif sp_digit* t2 = NULL; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 10, NULL, DYNAMIC_TYPE_ECC); + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 9, NULL, DYNAMIC_TYPE_ECC); if (t1 == NULL) { err = MEMORY_E; } #endif if (err == MP_OKAY) { - t2 = t1 + 2 * 10; + t2 = t1 + 2 * 9; { /* t2 = y ^ 0x2 */ - sp_256_mont_sqr_10(t2, y, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod); /* t1 = y ^ 0x3 */ - sp_256_mont_mul_10(t1, t2, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t2, y, p256_mod, p256_mp_mod); /* t2 = y ^ 0xc */ - sp_256_mont_sqr_n_10(t2, t1, 2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t2, t1, 2, p256_mod, p256_mp_mod); /* t1 = y ^ 0xf */ - sp_256_mont_mul_10(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod); /* t2 = y ^ 0xf0 */ - sp_256_mont_sqr_n_10(t2, t1, 4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t2, t1, 4, p256_mod, p256_mp_mod); /* t1 = y ^ 0xff */ - sp_256_mont_mul_10(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod); /* t2 = y ^ 0xff00 */ - sp_256_mont_sqr_n_10(t2, t1, 8, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t2, t1, 8, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffff */ - sp_256_mont_mul_10(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod); /* t2 = y ^ 0xffff0000 */ - sp_256_mont_sqr_n_10(t2, t1, 16, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t2, t1, 16, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffffffff */ - sp_256_mont_mul_10(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffffffff00000000 */ - sp_256_mont_sqr_n_10(t1, t1, 32, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t1, t1, 32, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffffffff00000001 */ - sp_256_mont_mul_10(t1, t1, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, y, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffffffff00000001000000000000000000000000 */ - sp_256_mont_sqr_n_10(t1, t1, 96, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(t1, t1, 96, p256_mod, p256_mp_mod); /* t1 = y ^ 0xffffffff00000001000000000000000000000001 */ - sp_256_mont_mul_10(t1, t1, y, p256_mod, p256_mp_mod); - sp_256_mont_sqr_n_10(y, t1, 94, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, y, p256_mod, p256_mp_mod); + sp_256_mont_sqr_n_9(y, t1, 94, p256_mod, p256_mp_mod); } } @@ -19383,46 +26764,46 @@ int sp_ecc_uncompress_256(mp_int* xm, int odd, mp_int* ym) #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* x = NULL; #else - sp_digit x[4 * 10]; + sp_digit x[4 * 9]; #endif sp_digit* y = NULL; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - x = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 10, NULL, DYNAMIC_TYPE_ECC); + x = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 9, NULL, DYNAMIC_TYPE_ECC); if (x == NULL) err = MEMORY_E; #endif if (err == MP_OKAY) { - y = x + 2 * 10; + y = x + 2 * 9; - sp_256_from_mp(x, 10, xm); - err = sp_256_mod_mul_norm_10(x, x, p256_mod); + sp_256_from_mp(x, 9, xm); + err = sp_256_mod_mul_norm_9(x, x, p256_mod); } if (err == MP_OKAY) { /* y = x^3 */ { - sp_256_mont_sqr_10(y, x, p256_mod, p256_mp_mod); - 
sp_256_mont_mul_10(y, y, x, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(y, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(y, y, x, p256_mod, p256_mp_mod); } /* y = x^3 - 3x */ - sp_256_mont_sub_10(y, y, x, p256_mod); - sp_256_mont_sub_10(y, y, x, p256_mod); - sp_256_mont_sub_10(y, y, x, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); /* y = x^3 - 3x + b */ - err = sp_256_mod_mul_norm_10(x, p256_b, p256_mod); + err = sp_256_mod_mul_norm_9(x, p256_b, p256_mod); } if (err == MP_OKAY) { - sp_256_mont_add_10(y, y, x, p256_mod); + sp_256_mont_add_9(y, y, x, p256_mod); /* y = sqrt(x^3 - 3x + b) */ - err = sp_256_mont_sqrt_10(y); + err = sp_256_mont_sqrt_9(y); } if (err == MP_OKAY) { - XMEMSET(y + 10, 0, 10U * sizeof(sp_digit)); - sp_256_mont_reduce_10(y, p256_mod, p256_mp_mod); + XMEMSET(y + 9, 0, 9U * sizeof(sp_digit)); + sp_256_mont_reduce_9(y, p256_mod, p256_mp_mod); if ((((word32)y[0] ^ (word32)odd) & 1U) != 0U) { - sp_256_mont_sub_10(y, p256_mod, y, p256_mod); + sp_256_mont_sub_9(y, p256_mod, y, p256_mod); } err = sp_256_to_mp(y, ym); @@ -19543,30 +26924,33 @@ SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 lo; - c = ((int64_t)a[14]) * b[14]; + c = ((sp_uint64)a[14]) * b[14]; r[29] = (sp_digit)(c >> 26); - c = (c & 0x3ffffff) << 26; + c &= 0x3ffffff; for (k = 27; k >= 0; k--) { - for (i = 14; i >= 0; i--) { - j = k - i; - if (j >= 15) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + if (k >= 15) { + i = k - 14; + imax = 14; } - r[k + 2] += (sp_digit)(c >> 52); - r[k + 1] = (sp_digit)((c >> 26) & 0x3ffffff); - c = (c & 0x3ffffff) << 26; + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 26; + r[k + 2] += (sp_digit)(c >> 26); + r[k + 1] = (sp_digit)(c & 0x3ffffff); + c = lo & 0x3ffffff; } - r[0] = (sp_digit)(c >> 26); + r[0] = (sp_digit)c; } #else @@ -19579,231 +26963,231 @@ SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int64_t t0 = ((int64_t)a[ 0]) * b[ 0]; - int64_t t1 = ((int64_t)a[ 0]) * b[ 1] - + ((int64_t)a[ 1]) * b[ 0]; - int64_t t2 = ((int64_t)a[ 0]) * b[ 2] - + ((int64_t)a[ 1]) * b[ 1] - + ((int64_t)a[ 2]) * b[ 0]; - int64_t t3 = ((int64_t)a[ 0]) * b[ 3] - + ((int64_t)a[ 1]) * b[ 2] - + ((int64_t)a[ 2]) * b[ 1] - + ((int64_t)a[ 3]) * b[ 0]; - int64_t t4 = ((int64_t)a[ 0]) * b[ 4] - + ((int64_t)a[ 1]) * b[ 3] - + ((int64_t)a[ 2]) * b[ 2] - + ((int64_t)a[ 3]) * b[ 1] - + ((int64_t)a[ 4]) * b[ 0]; - int64_t t5 = ((int64_t)a[ 0]) * b[ 5] - + ((int64_t)a[ 1]) * b[ 4] - + ((int64_t)a[ 2]) * b[ 3] - + ((int64_t)a[ 3]) * b[ 2] - + ((int64_t)a[ 4]) * b[ 1] - + ((int64_t)a[ 5]) * b[ 0]; - int64_t t6 = ((int64_t)a[ 0]) * b[ 6] - + ((int64_t)a[ 1]) * b[ 5] - + ((int64_t)a[ 2]) * b[ 4] - + ((int64_t)a[ 3]) * b[ 3] - + ((int64_t)a[ 4]) * b[ 2] - + ((int64_t)a[ 5]) * b[ 1] - + ((int64_t)a[ 6]) * b[ 0]; - int64_t t7 = ((int64_t)a[ 0]) * b[ 7] - + ((int64_t)a[ 1]) * b[ 6] - + ((int64_t)a[ 2]) * b[ 5] - + ((int64_t)a[ 3]) * b[ 4] - + ((int64_t)a[ 4]) * b[ 3] - + ((int64_t)a[ 5]) * b[ 2] - + ((int64_t)a[ 6]) * b[ 1] - + ((int64_t)a[ 7]) * b[ 0]; - int64_t t8 = ((int64_t)a[ 0]) * b[ 8] - + ((int64_t)a[ 1]) * b[ 7] - + ((int64_t)a[ 2]) * b[ 6] - + ((int64_t)a[ 3]) * b[ 5] - + ((int64_t)a[ 
4]) * b[ 4] - + ((int64_t)a[ 5]) * b[ 3] - + ((int64_t)a[ 6]) * b[ 2] - + ((int64_t)a[ 7]) * b[ 1] - + ((int64_t)a[ 8]) * b[ 0]; - int64_t t9 = ((int64_t)a[ 0]) * b[ 9] - + ((int64_t)a[ 1]) * b[ 8] - + ((int64_t)a[ 2]) * b[ 7] - + ((int64_t)a[ 3]) * b[ 6] - + ((int64_t)a[ 4]) * b[ 5] - + ((int64_t)a[ 5]) * b[ 4] - + ((int64_t)a[ 6]) * b[ 3] - + ((int64_t)a[ 7]) * b[ 2] - + ((int64_t)a[ 8]) * b[ 1] - + ((int64_t)a[ 9]) * b[ 0]; - int64_t t10 = ((int64_t)a[ 0]) * b[10] - + ((int64_t)a[ 1]) * b[ 9] - + ((int64_t)a[ 2]) * b[ 8] - + ((int64_t)a[ 3]) * b[ 7] - + ((int64_t)a[ 4]) * b[ 6] - + ((int64_t)a[ 5]) * b[ 5] - + ((int64_t)a[ 6]) * b[ 4] - + ((int64_t)a[ 7]) * b[ 3] - + ((int64_t)a[ 8]) * b[ 2] - + ((int64_t)a[ 9]) * b[ 1] - + ((int64_t)a[10]) * b[ 0]; - int64_t t11 = ((int64_t)a[ 0]) * b[11] - + ((int64_t)a[ 1]) * b[10] - + ((int64_t)a[ 2]) * b[ 9] - + ((int64_t)a[ 3]) * b[ 8] - + ((int64_t)a[ 4]) * b[ 7] - + ((int64_t)a[ 5]) * b[ 6] - + ((int64_t)a[ 6]) * b[ 5] - + ((int64_t)a[ 7]) * b[ 4] - + ((int64_t)a[ 8]) * b[ 3] - + ((int64_t)a[ 9]) * b[ 2] - + ((int64_t)a[10]) * b[ 1] - + ((int64_t)a[11]) * b[ 0]; - int64_t t12 = ((int64_t)a[ 0]) * b[12] - + ((int64_t)a[ 1]) * b[11] - + ((int64_t)a[ 2]) * b[10] - + ((int64_t)a[ 3]) * b[ 9] - + ((int64_t)a[ 4]) * b[ 8] - + ((int64_t)a[ 5]) * b[ 7] - + ((int64_t)a[ 6]) * b[ 6] - + ((int64_t)a[ 7]) * b[ 5] - + ((int64_t)a[ 8]) * b[ 4] - + ((int64_t)a[ 9]) * b[ 3] - + ((int64_t)a[10]) * b[ 2] - + ((int64_t)a[11]) * b[ 1] - + ((int64_t)a[12]) * b[ 0]; - int64_t t13 = ((int64_t)a[ 0]) * b[13] - + ((int64_t)a[ 1]) * b[12] - + ((int64_t)a[ 2]) * b[11] - + ((int64_t)a[ 3]) * b[10] - + ((int64_t)a[ 4]) * b[ 9] - + ((int64_t)a[ 5]) * b[ 8] - + ((int64_t)a[ 6]) * b[ 7] - + ((int64_t)a[ 7]) * b[ 6] - + ((int64_t)a[ 8]) * b[ 5] - + ((int64_t)a[ 9]) * b[ 4] - + ((int64_t)a[10]) * b[ 3] - + ((int64_t)a[11]) * b[ 2] - + ((int64_t)a[12]) * b[ 1] - + ((int64_t)a[13]) * b[ 0]; - int64_t t14 = ((int64_t)a[ 0]) * b[14] - + ((int64_t)a[ 1]) * b[13] - + ((int64_t)a[ 2]) * b[12] - + ((int64_t)a[ 3]) * b[11] - + ((int64_t)a[ 4]) * b[10] - + ((int64_t)a[ 5]) * b[ 9] - + ((int64_t)a[ 6]) * b[ 8] - + ((int64_t)a[ 7]) * b[ 7] - + ((int64_t)a[ 8]) * b[ 6] - + ((int64_t)a[ 9]) * b[ 5] - + ((int64_t)a[10]) * b[ 4] - + ((int64_t)a[11]) * b[ 3] - + ((int64_t)a[12]) * b[ 2] - + ((int64_t)a[13]) * b[ 1] - + ((int64_t)a[14]) * b[ 0]; - int64_t t15 = ((int64_t)a[ 1]) * b[14] - + ((int64_t)a[ 2]) * b[13] - + ((int64_t)a[ 3]) * b[12] - + ((int64_t)a[ 4]) * b[11] - + ((int64_t)a[ 5]) * b[10] - + ((int64_t)a[ 6]) * b[ 9] - + ((int64_t)a[ 7]) * b[ 8] - + ((int64_t)a[ 8]) * b[ 7] - + ((int64_t)a[ 9]) * b[ 6] - + ((int64_t)a[10]) * b[ 5] - + ((int64_t)a[11]) * b[ 4] - + ((int64_t)a[12]) * b[ 3] - + ((int64_t)a[13]) * b[ 2] - + ((int64_t)a[14]) * b[ 1]; - int64_t t16 = ((int64_t)a[ 2]) * b[14] - + ((int64_t)a[ 3]) * b[13] - + ((int64_t)a[ 4]) * b[12] - + ((int64_t)a[ 5]) * b[11] - + ((int64_t)a[ 6]) * b[10] - + ((int64_t)a[ 7]) * b[ 9] - + ((int64_t)a[ 8]) * b[ 8] - + ((int64_t)a[ 9]) * b[ 7] - + ((int64_t)a[10]) * b[ 6] - + ((int64_t)a[11]) * b[ 5] - + ((int64_t)a[12]) * b[ 4] - + ((int64_t)a[13]) * b[ 3] - + ((int64_t)a[14]) * b[ 2]; - int64_t t17 = ((int64_t)a[ 3]) * b[14] - + ((int64_t)a[ 4]) * b[13] - + ((int64_t)a[ 5]) * b[12] - + ((int64_t)a[ 6]) * b[11] - + ((int64_t)a[ 7]) * b[10] - + ((int64_t)a[ 8]) * b[ 9] - + ((int64_t)a[ 9]) * b[ 8] - + ((int64_t)a[10]) * b[ 7] - + ((int64_t)a[11]) * b[ 6] - + ((int64_t)a[12]) * b[ 5] - + ((int64_t)a[13]) * b[ 4] - + ((int64_t)a[14]) * b[ 3]; - 
int64_t t18 = ((int64_t)a[ 4]) * b[14] - + ((int64_t)a[ 5]) * b[13] - + ((int64_t)a[ 6]) * b[12] - + ((int64_t)a[ 7]) * b[11] - + ((int64_t)a[ 8]) * b[10] - + ((int64_t)a[ 9]) * b[ 9] - + ((int64_t)a[10]) * b[ 8] - + ((int64_t)a[11]) * b[ 7] - + ((int64_t)a[12]) * b[ 6] - + ((int64_t)a[13]) * b[ 5] - + ((int64_t)a[14]) * b[ 4]; - int64_t t19 = ((int64_t)a[ 5]) * b[14] - + ((int64_t)a[ 6]) * b[13] - + ((int64_t)a[ 7]) * b[12] - + ((int64_t)a[ 8]) * b[11] - + ((int64_t)a[ 9]) * b[10] - + ((int64_t)a[10]) * b[ 9] - + ((int64_t)a[11]) * b[ 8] - + ((int64_t)a[12]) * b[ 7] - + ((int64_t)a[13]) * b[ 6] - + ((int64_t)a[14]) * b[ 5]; - int64_t t20 = ((int64_t)a[ 6]) * b[14] - + ((int64_t)a[ 7]) * b[13] - + ((int64_t)a[ 8]) * b[12] - + ((int64_t)a[ 9]) * b[11] - + ((int64_t)a[10]) * b[10] - + ((int64_t)a[11]) * b[ 9] - + ((int64_t)a[12]) * b[ 8] - + ((int64_t)a[13]) * b[ 7] - + ((int64_t)a[14]) * b[ 6]; - int64_t t21 = ((int64_t)a[ 7]) * b[14] - + ((int64_t)a[ 8]) * b[13] - + ((int64_t)a[ 9]) * b[12] - + ((int64_t)a[10]) * b[11] - + ((int64_t)a[11]) * b[10] - + ((int64_t)a[12]) * b[ 9] - + ((int64_t)a[13]) * b[ 8] - + ((int64_t)a[14]) * b[ 7]; - int64_t t22 = ((int64_t)a[ 8]) * b[14] - + ((int64_t)a[ 9]) * b[13] - + ((int64_t)a[10]) * b[12] - + ((int64_t)a[11]) * b[11] - + ((int64_t)a[12]) * b[10] - + ((int64_t)a[13]) * b[ 9] - + ((int64_t)a[14]) * b[ 8]; - int64_t t23 = ((int64_t)a[ 9]) * b[14] - + ((int64_t)a[10]) * b[13] - + ((int64_t)a[11]) * b[12] - + ((int64_t)a[12]) * b[11] - + ((int64_t)a[13]) * b[10] - + ((int64_t)a[14]) * b[ 9]; - int64_t t24 = ((int64_t)a[10]) * b[14] - + ((int64_t)a[11]) * b[13] - + ((int64_t)a[12]) * b[12] - + ((int64_t)a[13]) * b[11] - + ((int64_t)a[14]) * b[10]; - int64_t t25 = ((int64_t)a[11]) * b[14] - + ((int64_t)a[12]) * b[13] - + ((int64_t)a[13]) * b[12] - + ((int64_t)a[14]) * b[11]; - int64_t t26 = ((int64_t)a[12]) * b[14] - + ((int64_t)a[13]) * b[13] - + ((int64_t)a[14]) * b[12]; - int64_t t27 = ((int64_t)a[13]) * b[14] - + ((int64_t)a[14]) * b[13]; - int64_t t28 = ((int64_t)a[14]) * b[14]; + sp_int64 t0 = ((sp_int64)a[ 0]) * b[ 0]; + sp_int64 t1 = ((sp_int64)a[ 0]) * b[ 1] + + ((sp_int64)a[ 1]) * b[ 0]; + sp_int64 t2 = ((sp_int64)a[ 0]) * b[ 2] + + ((sp_int64)a[ 1]) * b[ 1] + + ((sp_int64)a[ 2]) * b[ 0]; + sp_int64 t3 = ((sp_int64)a[ 0]) * b[ 3] + + ((sp_int64)a[ 1]) * b[ 2] + + ((sp_int64)a[ 2]) * b[ 1] + + ((sp_int64)a[ 3]) * b[ 0]; + sp_int64 t4 = ((sp_int64)a[ 0]) * b[ 4] + + ((sp_int64)a[ 1]) * b[ 3] + + ((sp_int64)a[ 2]) * b[ 2] + + ((sp_int64)a[ 3]) * b[ 1] + + ((sp_int64)a[ 4]) * b[ 0]; + sp_int64 t5 = ((sp_int64)a[ 0]) * b[ 5] + + ((sp_int64)a[ 1]) * b[ 4] + + ((sp_int64)a[ 2]) * b[ 3] + + ((sp_int64)a[ 3]) * b[ 2] + + ((sp_int64)a[ 4]) * b[ 1] + + ((sp_int64)a[ 5]) * b[ 0]; + sp_int64 t6 = ((sp_int64)a[ 0]) * b[ 6] + + ((sp_int64)a[ 1]) * b[ 5] + + ((sp_int64)a[ 2]) * b[ 4] + + ((sp_int64)a[ 3]) * b[ 3] + + ((sp_int64)a[ 4]) * b[ 2] + + ((sp_int64)a[ 5]) * b[ 1] + + ((sp_int64)a[ 6]) * b[ 0]; + sp_int64 t7 = ((sp_int64)a[ 0]) * b[ 7] + + ((sp_int64)a[ 1]) * b[ 6] + + ((sp_int64)a[ 2]) * b[ 5] + + ((sp_int64)a[ 3]) * b[ 4] + + ((sp_int64)a[ 4]) * b[ 3] + + ((sp_int64)a[ 5]) * b[ 2] + + ((sp_int64)a[ 6]) * b[ 1] + + ((sp_int64)a[ 7]) * b[ 0]; + sp_int64 t8 = ((sp_int64)a[ 0]) * b[ 8] + + ((sp_int64)a[ 1]) * b[ 7] + + ((sp_int64)a[ 2]) * b[ 6] + + ((sp_int64)a[ 3]) * b[ 5] + + ((sp_int64)a[ 4]) * b[ 4] + + ((sp_int64)a[ 5]) * b[ 3] + + ((sp_int64)a[ 6]) * b[ 2] + + ((sp_int64)a[ 7]) * b[ 1] + + ((sp_int64)a[ 8]) * b[ 0]; + sp_int64 t9 = ((sp_int64)a[ 
0]) * b[ 9] + + ((sp_int64)a[ 1]) * b[ 8] + + ((sp_int64)a[ 2]) * b[ 7] + + ((sp_int64)a[ 3]) * b[ 6] + + ((sp_int64)a[ 4]) * b[ 5] + + ((sp_int64)a[ 5]) * b[ 4] + + ((sp_int64)a[ 6]) * b[ 3] + + ((sp_int64)a[ 7]) * b[ 2] + + ((sp_int64)a[ 8]) * b[ 1] + + ((sp_int64)a[ 9]) * b[ 0]; + sp_int64 t10 = ((sp_int64)a[ 0]) * b[10] + + ((sp_int64)a[ 1]) * b[ 9] + + ((sp_int64)a[ 2]) * b[ 8] + + ((sp_int64)a[ 3]) * b[ 7] + + ((sp_int64)a[ 4]) * b[ 6] + + ((sp_int64)a[ 5]) * b[ 5] + + ((sp_int64)a[ 6]) * b[ 4] + + ((sp_int64)a[ 7]) * b[ 3] + + ((sp_int64)a[ 8]) * b[ 2] + + ((sp_int64)a[ 9]) * b[ 1] + + ((sp_int64)a[10]) * b[ 0]; + sp_int64 t11 = ((sp_int64)a[ 0]) * b[11] + + ((sp_int64)a[ 1]) * b[10] + + ((sp_int64)a[ 2]) * b[ 9] + + ((sp_int64)a[ 3]) * b[ 8] + + ((sp_int64)a[ 4]) * b[ 7] + + ((sp_int64)a[ 5]) * b[ 6] + + ((sp_int64)a[ 6]) * b[ 5] + + ((sp_int64)a[ 7]) * b[ 4] + + ((sp_int64)a[ 8]) * b[ 3] + + ((sp_int64)a[ 9]) * b[ 2] + + ((sp_int64)a[10]) * b[ 1] + + ((sp_int64)a[11]) * b[ 0]; + sp_int64 t12 = ((sp_int64)a[ 0]) * b[12] + + ((sp_int64)a[ 1]) * b[11] + + ((sp_int64)a[ 2]) * b[10] + + ((sp_int64)a[ 3]) * b[ 9] + + ((sp_int64)a[ 4]) * b[ 8] + + ((sp_int64)a[ 5]) * b[ 7] + + ((sp_int64)a[ 6]) * b[ 6] + + ((sp_int64)a[ 7]) * b[ 5] + + ((sp_int64)a[ 8]) * b[ 4] + + ((sp_int64)a[ 9]) * b[ 3] + + ((sp_int64)a[10]) * b[ 2] + + ((sp_int64)a[11]) * b[ 1] + + ((sp_int64)a[12]) * b[ 0]; + sp_int64 t13 = ((sp_int64)a[ 0]) * b[13] + + ((sp_int64)a[ 1]) * b[12] + + ((sp_int64)a[ 2]) * b[11] + + ((sp_int64)a[ 3]) * b[10] + + ((sp_int64)a[ 4]) * b[ 9] + + ((sp_int64)a[ 5]) * b[ 8] + + ((sp_int64)a[ 6]) * b[ 7] + + ((sp_int64)a[ 7]) * b[ 6] + + ((sp_int64)a[ 8]) * b[ 5] + + ((sp_int64)a[ 9]) * b[ 4] + + ((sp_int64)a[10]) * b[ 3] + + ((sp_int64)a[11]) * b[ 2] + + ((sp_int64)a[12]) * b[ 1] + + ((sp_int64)a[13]) * b[ 0]; + sp_int64 t14 = ((sp_int64)a[ 0]) * b[14] + + ((sp_int64)a[ 1]) * b[13] + + ((sp_int64)a[ 2]) * b[12] + + ((sp_int64)a[ 3]) * b[11] + + ((sp_int64)a[ 4]) * b[10] + + ((sp_int64)a[ 5]) * b[ 9] + + ((sp_int64)a[ 6]) * b[ 8] + + ((sp_int64)a[ 7]) * b[ 7] + + ((sp_int64)a[ 8]) * b[ 6] + + ((sp_int64)a[ 9]) * b[ 5] + + ((sp_int64)a[10]) * b[ 4] + + ((sp_int64)a[11]) * b[ 3] + + ((sp_int64)a[12]) * b[ 2] + + ((sp_int64)a[13]) * b[ 1] + + ((sp_int64)a[14]) * b[ 0]; + sp_int64 t15 = ((sp_int64)a[ 1]) * b[14] + + ((sp_int64)a[ 2]) * b[13] + + ((sp_int64)a[ 3]) * b[12] + + ((sp_int64)a[ 4]) * b[11] + + ((sp_int64)a[ 5]) * b[10] + + ((sp_int64)a[ 6]) * b[ 9] + + ((sp_int64)a[ 7]) * b[ 8] + + ((sp_int64)a[ 8]) * b[ 7] + + ((sp_int64)a[ 9]) * b[ 6] + + ((sp_int64)a[10]) * b[ 5] + + ((sp_int64)a[11]) * b[ 4] + + ((sp_int64)a[12]) * b[ 3] + + ((sp_int64)a[13]) * b[ 2] + + ((sp_int64)a[14]) * b[ 1]; + sp_int64 t16 = ((sp_int64)a[ 2]) * b[14] + + ((sp_int64)a[ 3]) * b[13] + + ((sp_int64)a[ 4]) * b[12] + + ((sp_int64)a[ 5]) * b[11] + + ((sp_int64)a[ 6]) * b[10] + + ((sp_int64)a[ 7]) * b[ 9] + + ((sp_int64)a[ 8]) * b[ 8] + + ((sp_int64)a[ 9]) * b[ 7] + + ((sp_int64)a[10]) * b[ 6] + + ((sp_int64)a[11]) * b[ 5] + + ((sp_int64)a[12]) * b[ 4] + + ((sp_int64)a[13]) * b[ 3] + + ((sp_int64)a[14]) * b[ 2]; + sp_int64 t17 = ((sp_int64)a[ 3]) * b[14] + + ((sp_int64)a[ 4]) * b[13] + + ((sp_int64)a[ 5]) * b[12] + + ((sp_int64)a[ 6]) * b[11] + + ((sp_int64)a[ 7]) * b[10] + + ((sp_int64)a[ 8]) * b[ 9] + + ((sp_int64)a[ 9]) * b[ 8] + + ((sp_int64)a[10]) * b[ 7] + + ((sp_int64)a[11]) * b[ 6] + + ((sp_int64)a[12]) * b[ 5] + + ((sp_int64)a[13]) * b[ 4] + + ((sp_int64)a[14]) * b[ 3]; + sp_int64 t18 = ((sp_int64)a[ 4]) * 
b[14] + + ((sp_int64)a[ 5]) * b[13] + + ((sp_int64)a[ 6]) * b[12] + + ((sp_int64)a[ 7]) * b[11] + + ((sp_int64)a[ 8]) * b[10] + + ((sp_int64)a[ 9]) * b[ 9] + + ((sp_int64)a[10]) * b[ 8] + + ((sp_int64)a[11]) * b[ 7] + + ((sp_int64)a[12]) * b[ 6] + + ((sp_int64)a[13]) * b[ 5] + + ((sp_int64)a[14]) * b[ 4]; + sp_int64 t19 = ((sp_int64)a[ 5]) * b[14] + + ((sp_int64)a[ 6]) * b[13] + + ((sp_int64)a[ 7]) * b[12] + + ((sp_int64)a[ 8]) * b[11] + + ((sp_int64)a[ 9]) * b[10] + + ((sp_int64)a[10]) * b[ 9] + + ((sp_int64)a[11]) * b[ 8] + + ((sp_int64)a[12]) * b[ 7] + + ((sp_int64)a[13]) * b[ 6] + + ((sp_int64)a[14]) * b[ 5]; + sp_int64 t20 = ((sp_int64)a[ 6]) * b[14] + + ((sp_int64)a[ 7]) * b[13] + + ((sp_int64)a[ 8]) * b[12] + + ((sp_int64)a[ 9]) * b[11] + + ((sp_int64)a[10]) * b[10] + + ((sp_int64)a[11]) * b[ 9] + + ((sp_int64)a[12]) * b[ 8] + + ((sp_int64)a[13]) * b[ 7] + + ((sp_int64)a[14]) * b[ 6]; + sp_int64 t21 = ((sp_int64)a[ 7]) * b[14] + + ((sp_int64)a[ 8]) * b[13] + + ((sp_int64)a[ 9]) * b[12] + + ((sp_int64)a[10]) * b[11] + + ((sp_int64)a[11]) * b[10] + + ((sp_int64)a[12]) * b[ 9] + + ((sp_int64)a[13]) * b[ 8] + + ((sp_int64)a[14]) * b[ 7]; + sp_int64 t22 = ((sp_int64)a[ 8]) * b[14] + + ((sp_int64)a[ 9]) * b[13] + + ((sp_int64)a[10]) * b[12] + + ((sp_int64)a[11]) * b[11] + + ((sp_int64)a[12]) * b[10] + + ((sp_int64)a[13]) * b[ 9] + + ((sp_int64)a[14]) * b[ 8]; + sp_int64 t23 = ((sp_int64)a[ 9]) * b[14] + + ((sp_int64)a[10]) * b[13] + + ((sp_int64)a[11]) * b[12] + + ((sp_int64)a[12]) * b[11] + + ((sp_int64)a[13]) * b[10] + + ((sp_int64)a[14]) * b[ 9]; + sp_int64 t24 = ((sp_int64)a[10]) * b[14] + + ((sp_int64)a[11]) * b[13] + + ((sp_int64)a[12]) * b[12] + + ((sp_int64)a[13]) * b[11] + + ((sp_int64)a[14]) * b[10]; + sp_int64 t25 = ((sp_int64)a[11]) * b[14] + + ((sp_int64)a[12]) * b[13] + + ((sp_int64)a[13]) * b[12] + + ((sp_int64)a[14]) * b[11]; + sp_int64 t26 = ((sp_int64)a[12]) * b[14] + + ((sp_int64)a[13]) * b[13] + + ((sp_int64)a[14]) * b[12]; + sp_int64 t27 = ((sp_int64)a[13]) * b[14] + + ((sp_int64)a[14]) * b[13]; + sp_int64 t28 = ((sp_int64)a[14]) * b[14]; t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; @@ -19847,31 +27231,34 @@ SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_384_sqr_15(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[14]) * a[14]; + c = ((sp_uint64)a[14]) * a[14]; r[29] = (sp_digit)(c >> 26); c = (c & 0x3ffffff) << 26; for (k = 27; k >= 0; k--) { - for (i = 14; i >= 0; i--) { - j = k - i; - if (j >= 15 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + if (k < 14) { + imax = k; } + else { + imax = 14; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; - r[k + 2] += (sp_digit)(c >> 52); - r[k + 1] = (sp_digit)((c >> 26) & 0x3ffffff); + r[k + 2] += (sp_digit) (c >> 52); + r[k + 1] = (sp_digit)((c >> 26) & 0x3ffffff); c = (c & 0x3ffffff) << 26; } r[0] = (sp_digit)(c >> 26); @@ -19885,126 +27272,126 @@ SP_NOINLINE static void sp_384_sqr_15(sp_digit* r, const sp_digit* a) */ SP_NOINLINE static void sp_384_sqr_15(sp_digit* r, const sp_digit* a) { - int64_t t0 = ((int64_t)a[ 0]) * a[ 0]; - int64_t t1 = (((int64_t)a[ 0]) * a[ 1]) * 2; - int64_t t2 = (((int64_t)a[ 0]) * a[ 2]) * 2 - + ((int64_t)a[ 1]) * 
a[ 1]; - int64_t t3 = (((int64_t)a[ 0]) * a[ 3] - + ((int64_t)a[ 1]) * a[ 2]) * 2; - int64_t t4 = (((int64_t)a[ 0]) * a[ 4] - + ((int64_t)a[ 1]) * a[ 3]) * 2 - + ((int64_t)a[ 2]) * a[ 2]; - int64_t t5 = (((int64_t)a[ 0]) * a[ 5] - + ((int64_t)a[ 1]) * a[ 4] - + ((int64_t)a[ 2]) * a[ 3]) * 2; - int64_t t6 = (((int64_t)a[ 0]) * a[ 6] - + ((int64_t)a[ 1]) * a[ 5] - + ((int64_t)a[ 2]) * a[ 4]) * 2 - + ((int64_t)a[ 3]) * a[ 3]; - int64_t t7 = (((int64_t)a[ 0]) * a[ 7] - + ((int64_t)a[ 1]) * a[ 6] - + ((int64_t)a[ 2]) * a[ 5] - + ((int64_t)a[ 3]) * a[ 4]) * 2; - int64_t t8 = (((int64_t)a[ 0]) * a[ 8] - + ((int64_t)a[ 1]) * a[ 7] - + ((int64_t)a[ 2]) * a[ 6] - + ((int64_t)a[ 3]) * a[ 5]) * 2 - + ((int64_t)a[ 4]) * a[ 4]; - int64_t t9 = (((int64_t)a[ 0]) * a[ 9] - + ((int64_t)a[ 1]) * a[ 8] - + ((int64_t)a[ 2]) * a[ 7] - + ((int64_t)a[ 3]) * a[ 6] - + ((int64_t)a[ 4]) * a[ 5]) * 2; - int64_t t10 = (((int64_t)a[ 0]) * a[10] - + ((int64_t)a[ 1]) * a[ 9] - + ((int64_t)a[ 2]) * a[ 8] - + ((int64_t)a[ 3]) * a[ 7] - + ((int64_t)a[ 4]) * a[ 6]) * 2 - + ((int64_t)a[ 5]) * a[ 5]; - int64_t t11 = (((int64_t)a[ 0]) * a[11] - + ((int64_t)a[ 1]) * a[10] - + ((int64_t)a[ 2]) * a[ 9] - + ((int64_t)a[ 3]) * a[ 8] - + ((int64_t)a[ 4]) * a[ 7] - + ((int64_t)a[ 5]) * a[ 6]) * 2; - int64_t t12 = (((int64_t)a[ 0]) * a[12] - + ((int64_t)a[ 1]) * a[11] - + ((int64_t)a[ 2]) * a[10] - + ((int64_t)a[ 3]) * a[ 9] - + ((int64_t)a[ 4]) * a[ 8] - + ((int64_t)a[ 5]) * a[ 7]) * 2 - + ((int64_t)a[ 6]) * a[ 6]; - int64_t t13 = (((int64_t)a[ 0]) * a[13] - + ((int64_t)a[ 1]) * a[12] - + ((int64_t)a[ 2]) * a[11] - + ((int64_t)a[ 3]) * a[10] - + ((int64_t)a[ 4]) * a[ 9] - + ((int64_t)a[ 5]) * a[ 8] - + ((int64_t)a[ 6]) * a[ 7]) * 2; - int64_t t14 = (((int64_t)a[ 0]) * a[14] - + ((int64_t)a[ 1]) * a[13] - + ((int64_t)a[ 2]) * a[12] - + ((int64_t)a[ 3]) * a[11] - + ((int64_t)a[ 4]) * a[10] - + ((int64_t)a[ 5]) * a[ 9] - + ((int64_t)a[ 6]) * a[ 8]) * 2 - + ((int64_t)a[ 7]) * a[ 7]; - int64_t t15 = (((int64_t)a[ 1]) * a[14] - + ((int64_t)a[ 2]) * a[13] - + ((int64_t)a[ 3]) * a[12] - + ((int64_t)a[ 4]) * a[11] - + ((int64_t)a[ 5]) * a[10] - + ((int64_t)a[ 6]) * a[ 9] - + ((int64_t)a[ 7]) * a[ 8]) * 2; - int64_t t16 = (((int64_t)a[ 2]) * a[14] - + ((int64_t)a[ 3]) * a[13] - + ((int64_t)a[ 4]) * a[12] - + ((int64_t)a[ 5]) * a[11] - + ((int64_t)a[ 6]) * a[10] - + ((int64_t)a[ 7]) * a[ 9]) * 2 - + ((int64_t)a[ 8]) * a[ 8]; - int64_t t17 = (((int64_t)a[ 3]) * a[14] - + ((int64_t)a[ 4]) * a[13] - + ((int64_t)a[ 5]) * a[12] - + ((int64_t)a[ 6]) * a[11] - + ((int64_t)a[ 7]) * a[10] - + ((int64_t)a[ 8]) * a[ 9]) * 2; - int64_t t18 = (((int64_t)a[ 4]) * a[14] - + ((int64_t)a[ 5]) * a[13] - + ((int64_t)a[ 6]) * a[12] - + ((int64_t)a[ 7]) * a[11] - + ((int64_t)a[ 8]) * a[10]) * 2 - + ((int64_t)a[ 9]) * a[ 9]; - int64_t t19 = (((int64_t)a[ 5]) * a[14] - + ((int64_t)a[ 6]) * a[13] - + ((int64_t)a[ 7]) * a[12] - + ((int64_t)a[ 8]) * a[11] - + ((int64_t)a[ 9]) * a[10]) * 2; - int64_t t20 = (((int64_t)a[ 6]) * a[14] - + ((int64_t)a[ 7]) * a[13] - + ((int64_t)a[ 8]) * a[12] - + ((int64_t)a[ 9]) * a[11]) * 2 - + ((int64_t)a[10]) * a[10]; - int64_t t21 = (((int64_t)a[ 7]) * a[14] - + ((int64_t)a[ 8]) * a[13] - + ((int64_t)a[ 9]) * a[12] - + ((int64_t)a[10]) * a[11]) * 2; - int64_t t22 = (((int64_t)a[ 8]) * a[14] - + ((int64_t)a[ 9]) * a[13] - + ((int64_t)a[10]) * a[12]) * 2 - + ((int64_t)a[11]) * a[11]; - int64_t t23 = (((int64_t)a[ 9]) * a[14] - + ((int64_t)a[10]) * a[13] - + ((int64_t)a[11]) * a[12]) * 2; - int64_t t24 = (((int64_t)a[10]) * a[14] - + 
((int64_t)a[11]) * a[13]) * 2 - + ((int64_t)a[12]) * a[12]; - int64_t t25 = (((int64_t)a[11]) * a[14] - + ((int64_t)a[12]) * a[13]) * 2; - int64_t t26 = (((int64_t)a[12]) * a[14]) * 2 - + ((int64_t)a[13]) * a[13]; - int64_t t27 = (((int64_t)a[13]) * a[14]) * 2; - int64_t t28 = ((int64_t)a[14]) * a[14]; + sp_int64 t0 = ((sp_int64)a[ 0]) * a[ 0]; + sp_int64 t1 = (((sp_int64)a[ 0]) * a[ 1]) * 2; + sp_int64 t2 = (((sp_int64)a[ 0]) * a[ 2]) * 2 + + ((sp_int64)a[ 1]) * a[ 1]; + sp_int64 t3 = (((sp_int64)a[ 0]) * a[ 3] + + ((sp_int64)a[ 1]) * a[ 2]) * 2; + sp_int64 t4 = (((sp_int64)a[ 0]) * a[ 4] + + ((sp_int64)a[ 1]) * a[ 3]) * 2 + + ((sp_int64)a[ 2]) * a[ 2]; + sp_int64 t5 = (((sp_int64)a[ 0]) * a[ 5] + + ((sp_int64)a[ 1]) * a[ 4] + + ((sp_int64)a[ 2]) * a[ 3]) * 2; + sp_int64 t6 = (((sp_int64)a[ 0]) * a[ 6] + + ((sp_int64)a[ 1]) * a[ 5] + + ((sp_int64)a[ 2]) * a[ 4]) * 2 + + ((sp_int64)a[ 3]) * a[ 3]; + sp_int64 t7 = (((sp_int64)a[ 0]) * a[ 7] + + ((sp_int64)a[ 1]) * a[ 6] + + ((sp_int64)a[ 2]) * a[ 5] + + ((sp_int64)a[ 3]) * a[ 4]) * 2; + sp_int64 t8 = (((sp_int64)a[ 0]) * a[ 8] + + ((sp_int64)a[ 1]) * a[ 7] + + ((sp_int64)a[ 2]) * a[ 6] + + ((sp_int64)a[ 3]) * a[ 5]) * 2 + + ((sp_int64)a[ 4]) * a[ 4]; + sp_int64 t9 = (((sp_int64)a[ 0]) * a[ 9] + + ((sp_int64)a[ 1]) * a[ 8] + + ((sp_int64)a[ 2]) * a[ 7] + + ((sp_int64)a[ 3]) * a[ 6] + + ((sp_int64)a[ 4]) * a[ 5]) * 2; + sp_int64 t10 = (((sp_int64)a[ 0]) * a[10] + + ((sp_int64)a[ 1]) * a[ 9] + + ((sp_int64)a[ 2]) * a[ 8] + + ((sp_int64)a[ 3]) * a[ 7] + + ((sp_int64)a[ 4]) * a[ 6]) * 2 + + ((sp_int64)a[ 5]) * a[ 5]; + sp_int64 t11 = (((sp_int64)a[ 0]) * a[11] + + ((sp_int64)a[ 1]) * a[10] + + ((sp_int64)a[ 2]) * a[ 9] + + ((sp_int64)a[ 3]) * a[ 8] + + ((sp_int64)a[ 4]) * a[ 7] + + ((sp_int64)a[ 5]) * a[ 6]) * 2; + sp_int64 t12 = (((sp_int64)a[ 0]) * a[12] + + ((sp_int64)a[ 1]) * a[11] + + ((sp_int64)a[ 2]) * a[10] + + ((sp_int64)a[ 3]) * a[ 9] + + ((sp_int64)a[ 4]) * a[ 8] + + ((sp_int64)a[ 5]) * a[ 7]) * 2 + + ((sp_int64)a[ 6]) * a[ 6]; + sp_int64 t13 = (((sp_int64)a[ 0]) * a[13] + + ((sp_int64)a[ 1]) * a[12] + + ((sp_int64)a[ 2]) * a[11] + + ((sp_int64)a[ 3]) * a[10] + + ((sp_int64)a[ 4]) * a[ 9] + + ((sp_int64)a[ 5]) * a[ 8] + + ((sp_int64)a[ 6]) * a[ 7]) * 2; + sp_int64 t14 = (((sp_int64)a[ 0]) * a[14] + + ((sp_int64)a[ 1]) * a[13] + + ((sp_int64)a[ 2]) * a[12] + + ((sp_int64)a[ 3]) * a[11] + + ((sp_int64)a[ 4]) * a[10] + + ((sp_int64)a[ 5]) * a[ 9] + + ((sp_int64)a[ 6]) * a[ 8]) * 2 + + ((sp_int64)a[ 7]) * a[ 7]; + sp_int64 t15 = (((sp_int64)a[ 1]) * a[14] + + ((sp_int64)a[ 2]) * a[13] + + ((sp_int64)a[ 3]) * a[12] + + ((sp_int64)a[ 4]) * a[11] + + ((sp_int64)a[ 5]) * a[10] + + ((sp_int64)a[ 6]) * a[ 9] + + ((sp_int64)a[ 7]) * a[ 8]) * 2; + sp_int64 t16 = (((sp_int64)a[ 2]) * a[14] + + ((sp_int64)a[ 3]) * a[13] + + ((sp_int64)a[ 4]) * a[12] + + ((sp_int64)a[ 5]) * a[11] + + ((sp_int64)a[ 6]) * a[10] + + ((sp_int64)a[ 7]) * a[ 9]) * 2 + + ((sp_int64)a[ 8]) * a[ 8]; + sp_int64 t17 = (((sp_int64)a[ 3]) * a[14] + + ((sp_int64)a[ 4]) * a[13] + + ((sp_int64)a[ 5]) * a[12] + + ((sp_int64)a[ 6]) * a[11] + + ((sp_int64)a[ 7]) * a[10] + + ((sp_int64)a[ 8]) * a[ 9]) * 2; + sp_int64 t18 = (((sp_int64)a[ 4]) * a[14] + + ((sp_int64)a[ 5]) * a[13] + + ((sp_int64)a[ 6]) * a[12] + + ((sp_int64)a[ 7]) * a[11] + + ((sp_int64)a[ 8]) * a[10]) * 2 + + ((sp_int64)a[ 9]) * a[ 9]; + sp_int64 t19 = (((sp_int64)a[ 5]) * a[14] + + ((sp_int64)a[ 6]) * a[13] + + ((sp_int64)a[ 7]) * a[12] + + ((sp_int64)a[ 8]) * a[11] + + ((sp_int64)a[ 9]) * a[10]) * 2; + sp_int64 t20 = 
(((sp_int64)a[ 6]) * a[14] + + ((sp_int64)a[ 7]) * a[13] + + ((sp_int64)a[ 8]) * a[12] + + ((sp_int64)a[ 9]) * a[11]) * 2 + + ((sp_int64)a[10]) * a[10]; + sp_int64 t21 = (((sp_int64)a[ 7]) * a[14] + + ((sp_int64)a[ 8]) * a[13] + + ((sp_int64)a[ 9]) * a[12] + + ((sp_int64)a[10]) * a[11]) * 2; + sp_int64 t22 = (((sp_int64)a[ 8]) * a[14] + + ((sp_int64)a[ 9]) * a[13] + + ((sp_int64)a[10]) * a[12]) * 2 + + ((sp_int64)a[11]) * a[11]; + sp_int64 t23 = (((sp_int64)a[ 9]) * a[14] + + ((sp_int64)a[10]) * a[13] + + ((sp_int64)a[11]) * a[12]) * 2; + sp_int64 t24 = (((sp_int64)a[10]) * a[14] + + ((sp_int64)a[11]) * a[13]) * 2 + + ((sp_int64)a[12]) * a[12]; + sp_int64 t25 = (((sp_int64)a[11]) * a[14] + + ((sp_int64)a[12]) * a[13]) * 2; + sp_int64 t26 = (((sp_int64)a[12]) * a[14]) * 2 + + ((sp_int64)a[13]) * a[13]; + sp_int64 t27 = (((sp_int64)a[13]) * a[14]) * 2; + sp_int64 t28 = ((sp_int64)a[14]) * a[14]; t1 += t0 >> 26; r[ 0] = t0 & 0x3ffffff; t2 += t1 >> 26; r[ 1] = t1 & 0x3ffffff; @@ -20413,19 +27800,37 @@ SP_NOINLINE static void sp_384_mul_add_15(sp_digit* r, const sp_digit* a, const sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t[4]; int i; - for (i = 0; i < 15; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x3ffffff; - t >>= 26; + t[0] = 0; + for (i = 0; i < 12; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x3ffffff; + t[1] += t[0] >> 26; + r[i+1] = t[1] & 0x3ffffff; + t[2] += t[1] >> 26; + r[i+2] = t[2] & 0x3ffffff; + t[3] += t[2] >> 26; + r[i+3] = t[3] & 0x3ffffff; + t[0] = t[3] >> 26; } - r[15] += (sp_digit)t; + t[0] += (tb * a[12]) + r[12]; + t[1] = (tb * a[13]) + r[13]; + t[2] = (tb * a[14]) + r[14]; + r[12] = t[0] & 0x3ffffff; + t[1] += t[0] >> 26; + r[13] = t[1] & 0x3ffffff; + t[2] += t[1] >> 26; + r[14] = t[2] & 0x3ffffff; + r[15] += (sp_digit)(t[2] >> 26); #else - int64_t tb = b; - int64_t t[15]; + sp_int64 tb = b; + sp_int64 t[15]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -20461,7 +27866,7 @@ SP_NOINLINE static void sp_384_mul_add_15(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -/* Normalize the values in each word to 26. +/* Normalize the values in each word to 26 bits. * * a Array of sp_digit to normalize. */ @@ -20488,7 +27893,7 @@ static void sp_384_norm_15(sp_digit* a) a[12] += a[11] >> 26; a[11] &= 0x3ffffff; a[13] += a[12] >> 26; a[12] &= 0x3ffffff; a[14] += a[13] >> 26; a[13] &= 0x3ffffff; -#endif +#endif /* WOLFSSL_SP_SMALL */ } /* Shift the result in the high 384 bits down to the bottom. 
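Note on the sp_384_mul_add_15 hunk above: the small-code path no longer carries a single 64-bit accumulator through every limb (which chains each load, multiply, mask and shift on the previous one) but forms four limb products per iteration and ripples the carry through a small t[0..3] pipeline, with the leftover three limbs peeled off after the loop. A minimal sketch of the same pattern for a generic limb count follows; the fixed-width types, the 26-bit radix macros and the name mul_add_example are assumptions for illustration only, not code from this patch.

#include <stdint.h>

#define LIMB_BITS 26
#define LIMB_MASK 0x3ffffff
#define LIMBS     16                 /* example count, a multiple of 4 */

/* r += a * b for a LIMBS-limb number a (26-bit limbs) and a single limb b.
 * Four products are formed per iteration and the carries are rippled
 * through t[0..3], mirroring the rewritten sp_384_mul_add_15 above. */
static void mul_add_example(int64_t* r, const int64_t* a, int64_t b)
{
    int64_t t[4];
    int i;

    t[0] = 0;                        /* running carry lives in t[0] */
    for (i = 0; i < LIMBS; i += 4) {
        t[0] += b * a[i + 0] + r[i + 0];
        t[1]  = b * a[i + 1] + r[i + 1];
        t[2]  = b * a[i + 2] + r[i + 2];
        t[3]  = b * a[i + 3] + r[i + 3];
        r[i + 0] = t[0] & LIMB_MASK;
        t[1] += t[0] >> LIMB_BITS;
        r[i + 1] = t[1] & LIMB_MASK;
        t[2] += t[1] >> LIMB_BITS;
        r[i + 2] = t[2] & LIMB_MASK;
        t[3] += t[2] >> LIMB_BITS;
        r[i + 3] = t[3] & LIMB_MASK;
        t[0] = t[3] >> LIMB_BITS;
    }
    r[LIMBS] += t[0];                /* carry out into the limb above the top */
}

Because 15 is not a multiple of 4, the real function above handles the last three limbs explicitly after the loop instead of rounding the count up.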
@@ -20500,32 +27905,32 @@ static void sp_384_mont_shift_15(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - int64_t n = a[14] >> 20; - n += ((int64_t)a[15]) << 6; + sp_int64 n = a[14] >> 20; + n += ((sp_int64)a[15]) << 6; for (i = 0; i < 14; i++) { r[i] = n & 0x3ffffff; n >>= 26; - n += ((int64_t)a[16 + i]) << 6; + n += ((sp_int64)a[16 + i]) << 6; } r[14] = (sp_digit)n; #else - int64_t n = a[14] >> 20; - n += ((int64_t)a[15]) << 6; - r[ 0] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[16]) << 6; - r[ 1] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[17]) << 6; - r[ 2] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[18]) << 6; - r[ 3] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[19]) << 6; - r[ 4] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[20]) << 6; - r[ 5] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[21]) << 6; - r[ 6] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[22]) << 6; - r[ 7] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[23]) << 6; - r[ 8] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[24]) << 6; - r[ 9] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[25]) << 6; - r[10] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[26]) << 6; - r[11] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[27]) << 6; - r[12] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[28]) << 6; - r[13] = n & 0x3ffffff; n >>= 26; n += ((int64_t)a[29]) << 6; + sp_int64 n = a[14] >> 20; + n += ((sp_int64)a[15]) << 6; + r[ 0] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[16]) << 6; + r[ 1] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[17]) << 6; + r[ 2] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[18]) << 6; + r[ 3] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[19]) << 6; + r[ 4] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[20]) << 6; + r[ 5] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[21]) << 6; + r[ 6] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[22]) << 6; + r[ 7] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[23]) << 6; + r[ 8] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[24]) << 6; + r[ 9] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[25]) << 6; + r[10] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[26]) << 6; + r[11] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[27]) << 6; + r[12] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[28]) << 6; + r[13] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[29]) << 6; r[14] = (sp_digit)n; #endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[15], 0, sizeof(*r) * 15U); @@ -20712,7 +28117,7 @@ static void sp_384_map_15(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*15; - int32_t n; + sp_int32 n; sp_384_mont_inv_15(t1, p->z, t + 2*15); @@ -25253,7 +32658,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_15(sp_digit* r, byte* a) { int i; int j; @@ -25340,7 +32745,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_15(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_15(point->x, out); *outLen = 48; } @@ -25358,6 +32763,36 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_384_rshift_15(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<14; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (26 - n))) & 0x3ffffff; + } +#else + for (i=0; i<8; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (26 - n)) & 0x3ffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (26 - n)) & 0x3ffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (26 - n)) & 0x3ffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (26 - n)) & 0x3ffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (26 - n)) & 0x3ffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (26 - n)) & 0x3ffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (26 - n)) & 0x3ffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (26 - n)) & 0x3ffffff); + } + r[8] = (a[8] >> n) | ((a[9] << (26 - n)) & 0x3ffffff); + r[9] = (a[9] >> n) | ((a[10] << (26 - n)) & 0x3ffffff); + r[10] = (a[10] >> n) | ((a[11] << (26 - n)) & 0x3ffffff); + r[11] = (a[11] >> n) | ((a[12] << (26 - n)) & 0x3ffffff); + r[12] = (a[12] >> n) | ((a[13] << (26 - n)) & 0x3ffffff); + r[13] = (a[13] >> n) | ((a[14] << (26 - n)) & 0x3ffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[14] = a[14] >> n; +} + /* Multiply a by scalar b into r. (r = a * b) * * r A single precision integer. @@ -25368,8 +32803,8 @@ SP_NOINLINE static void sp_384_mul_d_15(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; for (i = 0; i < 15; i++) { @@ -25379,8 +32814,8 @@ SP_NOINLINE static void sp_384_mul_d_15(sp_digit* r, const sp_digit* a, } r[15] = (sp_digit)t; #else - int64_t tb = b; - int64_t t[15]; + sp_int64 tb = b; + sp_int64 t[15]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -25416,67 +32851,88 @@ SP_NOINLINE static void sp_384_mul_d_15(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -#ifdef WOLFSSL_SP_DIV_32 -static WC_INLINE sp_digit sp_384_div_word_15(sp_digit d1, sp_digit d0, - sp_digit dv) +SP_NOINLINE static void sp_384_lshift_30(sp_digit* r, const sp_digit* a, + byte n) { - sp_digit d; - sp_digit r; - sp_digit t; +#ifdef WOLFSSL_SP_SMALL + int i; - /* All 26 bits from d1 and top 5 bits from d0. */ - d = (d1 << 5) + (d0 >> 21); - r = d / dv; - d -= r * dv; - /* Up to 6 bits in r */ - /* Next 5 bits from d0. */ - r <<= 5; - d <<= 5; - d += (d0 >> 16) & ((1 << 5) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 11 bits in r */ - /* Next 5 bits from d0. */ - r <<= 5; - d <<= 5; - d += (d0 >> 11) & ((1 << 5) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 16 bits in r */ - /* Next 5 bits from d0. */ - r <<= 5; - d <<= 5; - d += (d0 >> 6) & ((1 << 5) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 21 bits in r */ - /* Next 5 bits from d0. */ - r <<= 5; - d <<= 5; - d += (d0 >> 1) & ((1 << 5) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 26 bits in r */ - /* Remaining 1 bits from d0. 
*/ - r <<= 1; - d <<= 1; - d += d0 & ((1 << 1) - 1); - t = d / dv; - r += t; + r[30] = a[29] >> (26 - n); + for (i=29; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (26 - n))) & 0x3ffffff; + } +#else + sp_int_digit s; + sp_int_digit t; - /* All 26 bits from d1 and top 5 bits from d0. */ - return r; + s = (sp_int_digit)a[29]; + r[30] = s >> (26U - n); + s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]); + r[29] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]); + r[28] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]); + r[27] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]); + r[26] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]); + r[25] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]); + r[24] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]); + r[23] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]); + r[22] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]); + r[21] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]); + r[20] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]); + r[19] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]); + r[18] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]); + r[17] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]); + r[16] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]); + r[15] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]); + r[14] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); + r[13] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); + r[12] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); + r[11] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); + r[10] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (26U - n))) & 0x3ffffff; +#endif /* WOLFSSL_SP_SMALL */ + r[0] = (a[0] << n) & 0x3ffffff; } 
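The sp_384_rshift_15 and sp_384_lshift_30 routines added above exist to normalize the division that follows: sp_384_div_15 now shifts the dividend (and a copy of the divisor) left by 6 bits so the divisor's top 26-bit limb becomes 0x3ffffff, i.e. all ones. With an all-ones top limb the quotient digit for each step can be taken directly from the top limb of the partial remainder and corrected with a single conditional subtract, which is why the old bit-at-a-time sp_384_div_word_15 helper is removed here. The check below is a small, self-contained illustration of why that estimate is safe; the radix constant and function name are assumptions for the example, not part of the patch.

#include <assert.h>
#include <stdint.h>

/* For a 26-bit radix, dividing a two-limb chunk (hi:lo) by an all-ones
 * top limb (0x3ffffff) gives a quotient of either hi or hi + 1 when hi is
 * small, so using hi as the first guess and fixing it up with one
 * conditional subtract is enough.  Illustrative check only. */
static void quotient_estimate_check(void)
{
    const uint64_t radix = (uint64_t)1 << 26;
    const uint64_t dtop  = radix - 1;            /* normalized top limb */
    uint64_t hi;
    uint64_t lo;

    for (hi = 0; hi < 8; hi++) {
        for (lo = 0; lo < radix; lo += (radix / 8) + 1) {
            uint64_t q = (hi * radix + lo) / dtop;   /* true quotient digit */
            assert(q == hi || q == hi + 1);          /* estimate off by at most one */
        }
    }
}

After the quotient loop, the remainder is still scaled by 2^6, so the function brings it back down with sp_384_rshift_15(r, t1, 6).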
-#endif /* WOLFSSL_SP_DIV_32 */ /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Simplified based on top word of divisor being (1 << 26) - 1 * * a Number to be divided. * d Number to divide with. @@ -25484,75 +32940,54 @@ static WC_INLINE sp_digit sp_384_div_word_15(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_384_div_15(const sp_digit* a, const sp_digit* d, +static int sp_384_div_15(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; -#ifndef WOLFSSL_SP_DIV_32 - int64_t d1; -#endif - sp_digit dv; sp_digit r1; + sp_digit mask; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 15 + 1]; + sp_digit t1[4 * 15 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 15 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 15 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif + (void)m; + if (err == MP_OKAY) { - t2 = t1 + 2 * 15; + t2 = t1 + 30 + 1; + sd = t2 + 15 + 1; - dv = d[14]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 15U); + sp_384_mul_d_15(sd, d, (sp_digit)1 << 6); + sp_384_lshift_30(t1, a, 6); + t1[15 + 15] += t1[15 + 15 - 1] >> 26; + t1[15 + 15 - 1] &= 0x3ffffff; for (i=14; i>=0; i--) { - t1[15 + i] += t1[15 + i - 1] >> 26; - t1[15 + i - 1] &= 0x3ffffff; -#ifndef WOLFSSL_SP_DIV_32 - d1 = t1[15 + i]; - d1 <<= 26; - d1 += t1[15 + i - 1]; - r1 = (sp_digit)(d1 / dv); -#else - r1 = sp_384_div_word_15(t1[15 + i], t1[15 + i - 1], dv); -#endif - - sp_384_mul_d_15(t2, d, r1); + r1 = t1[15 + i]; + sp_384_mul_d_15(t2, sd, r1); (void)sp_384_sub_15(&t1[i], &t1[i], t2); - sp_384_norm_15(&t1[i]); t1[15 + i] -= t2[15]; - t1[15 + i] += t1[15 + i - 1] >> 26; - t1[15 + i - 1] &= 0x3ffffff; - r1 = (((-t1[15 + i]) << 26) - t1[15 + i - 1]) / dv; - r1++; - sp_384_mul_d_15(t2, d, r1); - (void)sp_384_add_15(&t1[i], &t1[i], t2); - t1[15 + i] += t1[15 + i - 1] >> 26; - t1[15 + i - 1] &= 0x3ffffff; - } - t1[15 - 1] += t1[15 - 2] >> 26; - t1[15 - 2] &= 0x3ffffff; - r1 = t1[15 - 1] / dv; + sp_384_norm_15(&t1[i + 1]); - sp_384_mul_d_15(t2, d, r1); - (void)sp_384_sub_15(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 30U); - for (i=0; i<14; i++) { - r[i+1] += r[i] >> 26; - r[i] &= 0x3ffffff; + mask = (sp_digit)0 - ((t1[15 + i] > 0) ? + (sp_digit)1 : (sp_digit)0); + sp_384_cond_sub_15(t1 + i, t1 + i, sd, mask); + sp_384_norm_15(&t1[i + 1]); } - sp_384_cond_add_15(r, r, d, 0 - ((r[14] < 0) ? 
- (sp_digit)1 : (sp_digit)0)); + sp_384_norm_15(t1); + sp_384_rshift_15(r, t1, 6); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -25766,7 +33201,7 @@ static int sp_384_calc_s_15(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -25878,7 +33313,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 15U); sp_384_norm_15(ctx->r); @@ -25927,7 +33362,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_384_norm_15(ctx->x); carry = sp_384_add_15(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_15(ctx->s, ctx->s, @@ -25997,7 +33432,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -26481,7 +33916,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_15(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -26536,7 +33971,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -27228,55 +34663,55 @@ typedef struct sp_point_1024 { SP_NOINLINE static void sp_1024_mul_7(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int64_t t0 = ((int64_t)a[ 0]) * b[ 0]; - int64_t t1 = ((int64_t)a[ 0]) * b[ 1] - + ((int64_t)a[ 1]) * b[ 0]; - int64_t t2 = ((int64_t)a[ 0]) * b[ 2] - + ((int64_t)a[ 1]) * b[ 1] - + ((int64_t)a[ 2]) * b[ 0]; - int64_t t3 = ((int64_t)a[ 0]) * b[ 3] - + ((int64_t)a[ 1]) * b[ 2] - + ((int64_t)a[ 2]) * b[ 1] - + ((int64_t)a[ 3]) * b[ 0]; - int64_t t4 = ((int64_t)a[ 0]) * b[ 4] - + ((int64_t)a[ 1]) * b[ 3] - + ((int64_t)a[ 2]) * b[ 2] - + ((int64_t)a[ 3]) * b[ 1] - + ((int64_t)a[ 4]) * b[ 0]; - int64_t t5 = ((int64_t)a[ 0]) * b[ 5] - + ((int64_t)a[ 1]) * b[ 4] - + ((int64_t)a[ 2]) * b[ 3] - + ((int64_t)a[ 3]) * b[ 2] - + ((int64_t)a[ 4]) * b[ 1] - + ((int64_t)a[ 5]) * b[ 0]; - int64_t t6 = ((int64_t)a[ 0]) * b[ 6] - + ((int64_t)a[ 1]) * b[ 5] - + ((int64_t)a[ 2]) * b[ 4] - + ((int64_t)a[ 3]) * b[ 3] - + ((int64_t)a[ 4]) * b[ 2] - + ((int64_t)a[ 5]) * b[ 1] - + ((int64_t)a[ 6]) * b[ 0]; - int64_t t7 = ((int64_t)a[ 1]) * b[ 6] - + ((int64_t)a[ 2]) * b[ 5] - + ((int64_t)a[ 3]) * b[ 4] - + ((int64_t)a[ 4]) * b[ 3] - + ((int64_t)a[ 5]) * b[ 2] - + ((int64_t)a[ 6]) * b[ 1]; - int64_t t8 = ((int64_t)a[ 2]) * b[ 6] - + ((int64_t)a[ 3]) * b[ 5] - + ((int64_t)a[ 4]) * b[ 4] - + ((int64_t)a[ 5]) * b[ 3] - + ((int64_t)a[ 6]) * b[ 2]; - int64_t t9 = ((int64_t)a[ 3]) * b[ 6] - + ((int64_t)a[ 4]) * b[ 5] - + ((int64_t)a[ 5]) * b[ 4] - + ((int64_t)a[ 6]) * b[ 3]; - int64_t t10 = ((int64_t)a[ 4]) * b[ 6] - + ((int64_t)a[ 5]) * b[ 5] - + ((int64_t)a[ 6]) * b[ 4]; - int64_t t11 = ((int64_t)a[ 5]) * b[ 6] - + ((int64_t)a[ 6]) * b[ 5]; - int64_t t12 = ((int64_t)a[ 6]) * b[ 6]; + sp_int64 t0 = ((sp_int64)a[ 0]) * b[ 0]; + sp_int64 t1 = ((sp_int64)a[ 0]) * b[ 1] + + ((sp_int64)a[ 1]) * b[ 0]; + sp_int64 t2 = ((sp_int64)a[ 0]) * b[ 
2] + + ((sp_int64)a[ 1]) * b[ 1] + + ((sp_int64)a[ 2]) * b[ 0]; + sp_int64 t3 = ((sp_int64)a[ 0]) * b[ 3] + + ((sp_int64)a[ 1]) * b[ 2] + + ((sp_int64)a[ 2]) * b[ 1] + + ((sp_int64)a[ 3]) * b[ 0]; + sp_int64 t4 = ((sp_int64)a[ 0]) * b[ 4] + + ((sp_int64)a[ 1]) * b[ 3] + + ((sp_int64)a[ 2]) * b[ 2] + + ((sp_int64)a[ 3]) * b[ 1] + + ((sp_int64)a[ 4]) * b[ 0]; + sp_int64 t5 = ((sp_int64)a[ 0]) * b[ 5] + + ((sp_int64)a[ 1]) * b[ 4] + + ((sp_int64)a[ 2]) * b[ 3] + + ((sp_int64)a[ 3]) * b[ 2] + + ((sp_int64)a[ 4]) * b[ 1] + + ((sp_int64)a[ 5]) * b[ 0]; + sp_int64 t6 = ((sp_int64)a[ 0]) * b[ 6] + + ((sp_int64)a[ 1]) * b[ 5] + + ((sp_int64)a[ 2]) * b[ 4] + + ((sp_int64)a[ 3]) * b[ 3] + + ((sp_int64)a[ 4]) * b[ 2] + + ((sp_int64)a[ 5]) * b[ 1] + + ((sp_int64)a[ 6]) * b[ 0]; + sp_int64 t7 = ((sp_int64)a[ 1]) * b[ 6] + + ((sp_int64)a[ 2]) * b[ 5] + + ((sp_int64)a[ 3]) * b[ 4] + + ((sp_int64)a[ 4]) * b[ 3] + + ((sp_int64)a[ 5]) * b[ 2] + + ((sp_int64)a[ 6]) * b[ 1]; + sp_int64 t8 = ((sp_int64)a[ 2]) * b[ 6] + + ((sp_int64)a[ 3]) * b[ 5] + + ((sp_int64)a[ 4]) * b[ 4] + + ((sp_int64)a[ 5]) * b[ 3] + + ((sp_int64)a[ 6]) * b[ 2]; + sp_int64 t9 = ((sp_int64)a[ 3]) * b[ 6] + + ((sp_int64)a[ 4]) * b[ 5] + + ((sp_int64)a[ 5]) * b[ 4] + + ((sp_int64)a[ 6]) * b[ 3]; + sp_int64 t10 = ((sp_int64)a[ 4]) * b[ 6] + + ((sp_int64)a[ 5]) * b[ 5] + + ((sp_int64)a[ 6]) * b[ 4]; + sp_int64 t11 = ((sp_int64)a[ 5]) * b[ 6] + + ((sp_int64)a[ 6]) * b[ 5]; + sp_int64 t12 = ((sp_int64)a[ 6]) * b[ 6]; t1 += t0 >> 25; r[ 0] = t0 & 0x1ffffff; t2 += t1 >> 25; r[ 1] = t1 & 0x1ffffff; @@ -27301,34 +34736,34 @@ SP_NOINLINE static void sp_1024_mul_7(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_1024_sqr_7(sp_digit* r, const sp_digit* a) { - int64_t t0 = ((int64_t)a[ 0]) * a[ 0]; - int64_t t1 = (((int64_t)a[ 0]) * a[ 1]) * 2; - int64_t t2 = (((int64_t)a[ 0]) * a[ 2]) * 2 - + ((int64_t)a[ 1]) * a[ 1]; - int64_t t3 = (((int64_t)a[ 0]) * a[ 3] - + ((int64_t)a[ 1]) * a[ 2]) * 2; - int64_t t4 = (((int64_t)a[ 0]) * a[ 4] - + ((int64_t)a[ 1]) * a[ 3]) * 2 - + ((int64_t)a[ 2]) * a[ 2]; - int64_t t5 = (((int64_t)a[ 0]) * a[ 5] - + ((int64_t)a[ 1]) * a[ 4] - + ((int64_t)a[ 2]) * a[ 3]) * 2; - int64_t t6 = (((int64_t)a[ 0]) * a[ 6] - + ((int64_t)a[ 1]) * a[ 5] - + ((int64_t)a[ 2]) * a[ 4]) * 2 - + ((int64_t)a[ 3]) * a[ 3]; - int64_t t7 = (((int64_t)a[ 1]) * a[ 6] - + ((int64_t)a[ 2]) * a[ 5] - + ((int64_t)a[ 3]) * a[ 4]) * 2; - int64_t t8 = (((int64_t)a[ 2]) * a[ 6] - + ((int64_t)a[ 3]) * a[ 5]) * 2 - + ((int64_t)a[ 4]) * a[ 4]; - int64_t t9 = (((int64_t)a[ 3]) * a[ 6] - + ((int64_t)a[ 4]) * a[ 5]) * 2; - int64_t t10 = (((int64_t)a[ 4]) * a[ 6]) * 2 - + ((int64_t)a[ 5]) * a[ 5]; - int64_t t11 = (((int64_t)a[ 5]) * a[ 6]) * 2; - int64_t t12 = ((int64_t)a[ 6]) * a[ 6]; + sp_int64 t0 = ((sp_int64)a[ 0]) * a[ 0]; + sp_int64 t1 = (((sp_int64)a[ 0]) * a[ 1]) * 2; + sp_int64 t2 = (((sp_int64)a[ 0]) * a[ 2]) * 2 + + ((sp_int64)a[ 1]) * a[ 1]; + sp_int64 t3 = (((sp_int64)a[ 0]) * a[ 3] + + ((sp_int64)a[ 1]) * a[ 2]) * 2; + sp_int64 t4 = (((sp_int64)a[ 0]) * a[ 4] + + ((sp_int64)a[ 1]) * a[ 3]) * 2 + + ((sp_int64)a[ 2]) * a[ 2]; + sp_int64 t5 = (((sp_int64)a[ 0]) * a[ 5] + + ((sp_int64)a[ 1]) * a[ 4] + + ((sp_int64)a[ 2]) * a[ 3]) * 2; + sp_int64 t6 = (((sp_int64)a[ 0]) * a[ 6] + + ((sp_int64)a[ 1]) * a[ 5] + + ((sp_int64)a[ 2]) * a[ 4]) * 2 + + ((sp_int64)a[ 3]) * a[ 3]; + sp_int64 t7 = (((sp_int64)a[ 1]) * a[ 6] + + ((sp_int64)a[ 2]) * a[ 5] + + ((sp_int64)a[ 3]) * a[ 4]) * 2; + sp_int64 t8 = (((sp_int64)a[ 2]) * a[ 6] + + ((sp_int64)a[ 
3]) * a[ 5]) * 2 + + ((sp_int64)a[ 4]) * a[ 4]; + sp_int64 t9 = (((sp_int64)a[ 3]) * a[ 6] + + ((sp_int64)a[ 4]) * a[ 5]) * 2; + sp_int64 t10 = (((sp_int64)a[ 4]) * a[ 6]) * 2 + + ((sp_int64)a[ 5]) * a[ 5]; + sp_int64 t11 = (((sp_int64)a[ 5]) * a[ 6]) * 2; + sp_int64 t12 = ((sp_int64)a[ 6]) * a[ 6]; t1 += t0 >> 25; r[ 0] = t0 & 0x1ffffff; t2 += t1 >> 25; r[ 1] = t1 & 0x1ffffff; @@ -27651,30 +35086,33 @@ SP_NOINLINE static void sp_1024_mul_42(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 lo; - c = ((int64_t)a[41]) * b[41]; + c = ((sp_uint64)a[41]) * b[41]; r[83] = (sp_digit)(c >> 25); - c = (c & 0x1ffffff) << 25; + c &= 0x1ffffff; for (k = 81; k >= 0; k--) { - for (i = 41; i >= 0; i--) { - j = k - i; - if (j >= 42) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * b[j]; + if (k >= 42) { + i = k - 41; + imax = 41; } - r[k + 2] += (sp_digit)(c >> 50); - r[k + 1] = (sp_digit)((c >> 25) & 0x1ffffff); - c = (c & 0x1ffffff) << 25; + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint64)a[i]) * b[k - i]; + } + c += lo >> 25; + r[k + 2] += (sp_digit)(c >> 25); + r[k + 1] = (sp_digit)(c & 0x1ffffff); + c = lo & 0x1ffffff; } - r[0] = (sp_digit)(c >> 25); + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -27685,31 +35123,34 @@ SP_NOINLINE static void sp_1024_mul_42(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_1024_sqr_42(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int64_t c; + sp_uint64 c; + sp_uint64 t; - c = ((int64_t)a[41]) * a[41]; + c = ((sp_uint64)a[41]) * a[41]; r[83] = (sp_digit)(c >> 25); c = (c & 0x1ffffff) << 25; for (k = 81; k >= 0; k--) { - for (i = 41; i >= 0; i--) { - j = k - i; - if (j >= 42 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int64_t)a[i]) * a[j] * 2; + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint64)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int64_t)a[i]) * a[i]; + if (k < 41) { + imax = k; } + else { + imax = 41; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint64)a[i]) * a[k - i]; + } + c += t * 2; - r[k + 2] += (sp_digit)(c >> 50); - r[k + 1] = (sp_digit)((c >> 25) & 0x1ffffff); + r[k + 2] += (sp_digit) (c >> 50); + r[k + 1] = (sp_digit)((c >> 25) & 0x1ffffff); c = (c & 0x1ffffff) << 25; } r[0] = (sp_digit)(c >> 25); @@ -27807,15 +35248,15 @@ static const sp_point_1024 p1024_base = { 0 }; -/* Normalize the values in each word to 25. +/* Normalize the values in each word to 25 bits. * * a Array of sp_digit to normalize. */ -static void sp_1024_norm_42(sp_digit* a) +static void sp_1024_norm_41(sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 41; i++) { + for (i = 0; i < 40; i++) { a[i+1] += a[i] >> 25; a[i] &= 0x1ffffff; } @@ -27831,8 +35272,7 @@ static void sp_1024_norm_42(sp_digit* a) a[i+7] += a[i+6] >> 25; a[i+6] &= 0x1ffffff; a[i+8] += a[i+7] >> 25; a[i+7] &= 0x1ffffff; } - a[40+1] += a[40] >> 25; a[40] &= 0x1ffffff; -#endif +#endif /* WOLFSSL_SP_SMALL */ } /* Multiply a by scalar b into r. 
(r = a * b) @@ -27845,8 +35285,8 @@ SP_NOINLINE static void sp_1024_mul_d_42(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; int i; for (i = 0; i < 42; i++) { @@ -27856,10 +35296,10 @@ SP_NOINLINE static void sp_1024_mul_d_42(sp_digit* r, const sp_digit* a, } r[42] = (sp_digit)t; #else - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t = 0; sp_digit t2; - int64_t p[4]; + sp_int64 p[4]; int i; for (i = 0; i < 40; i += 4) { @@ -27894,6 +35334,59 @@ SP_NOINLINE static void sp_1024_mul_d_42(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_1024_mul_d_84(sp_digit* r, const sp_digit* a, + sp_digit b) +{ +#ifdef WOLFSSL_SP_SMALL + sp_int64 tb = b; + sp_int64 t = 0; + int i; + + for (i = 0; i < 84; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1ffffff); + t >>= 25; + } + r[84] = (sp_digit)t; +#else + sp_int64 tb = b; + sp_int64 t = 0; + sp_digit t2; + sp_int64 p[4]; + int i; + + for (i = 0; i < 84; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffff); + t >>= 25; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffff); + t >>= 25; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffff); + t >>= 25; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffff); + t >>= 25; + r[i + 3] = (sp_digit)t2; + } + r[84] = (sp_digit)(t & 0x1ffffff); +#endif /* WOLFSSL_SP_SMALL */ +} + /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -27908,7 +35401,7 @@ static void sp_1024_cond_add_42(sp_digit* r, const sp_digit* a, #ifdef WOLFSSL_SP_SMALL int i; - for (i = 0; i < 42; i++) { + for (i = 0; i < 41; i++) { r[i] = a[i] + (b[i] & m); } #else @@ -27925,7 +35418,6 @@ static void sp_1024_cond_add_42(sp_digit* r, const sp_digit* a, r[i + 7] = a[i + 7] + (b[i + 7] & m); } r[40] = a[40] + (b[40] & m); - r[41] = a[41] + (b[41] & m); #endif /* WOLFSSL_SP_SMALL */ } @@ -27967,7 +35459,33 @@ SP_NOINLINE static int sp_1024_add_42(sp_digit* r, const sp_digit* a, return 0; } -#endif +#endif /* WOLFSSL_SP_SMALL */ + +SP_NOINLINE static void sp_1024_rshift_42(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<41; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (25 - n))) & 0x1ffffff; + } +#else + for (i=0; i<40; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (25 - n)) & 0x1ffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (25 - n)) & 0x1ffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (25 - n)) & 0x1ffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (25 - n)) & 0x1ffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (25 - n)) & 0x1ffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (25 - n)) & 0x1ffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (25 - n)) & 0x1ffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (25 - n)) & 0x1ffffff); + } + r[40] = (a[40] >> n) | ((a[41] << (25 - n)) & 0x1ffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[41] = a[41] >> n; +} + #ifdef WOLFSSL_SP_DIV_32 static WC_INLINE sp_digit sp_1024_div_word_42(sp_digit d1, sp_digit d0, sp_digit dv) @@ -28020,7 +35538,7 @@ static WC_INLINE sp_digit sp_1024_div_word_42(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -28028,40 +35546,45 @@ static WC_INLINE sp_digit sp_1024_div_word_42(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
*/ -static int sp_1024_div_42(const sp_digit* a, const sp_digit* d, +static int sp_1024_div_42(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_32 - int64_t d1; + sp_int64 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 42 + 1]; + sp_digit t1[4 * 42 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 42 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 42 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 42; + (void)m; - dv = d[40]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 42U); - for (i=40; i>=0; i--) { - t1[42 + i] += t1[42 + i - 1] >> 25; - t1[42 + i - 1] &= 0x1ffffff; + if (err == MP_OKAY) { + t2 = t1 + 84 + 1; + sd = t2 + 42 + 1; + + sp_1024_mul_d_42(sd, d, (sp_digit)1 << 1); + sp_1024_mul_d_84(t1, a, (sp_digit)1 << 1); + dv = sd[40]; + t1[41 + 41] += t1[41 + 41 - 1] >> 25; + t1[41 + 41 - 1] &= 0x1ffffff; + for (i=41; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_32 d1 = t1[41 + i]; d1 <<= 25; @@ -28071,15 +35594,21 @@ static int sp_1024_div_42(const sp_digit* a, const sp_digit* d, r1 = sp_1024_div_word_42(t1[41 + i], t1[41 + i - 1], dv); #endif - sp_1024_mul_d_42(t2, d, r1); + sp_1024_mul_d_42(t2, sd, r1); (void)sp_1024_sub_42(&t1[i], &t1[i], t2); - sp_1024_norm_42(&t1[i]); - t1[42 + i] -= t2[42]; + sp_1024_norm_41(&t1[i]); t1[41 + i] += t1[41 + i - 1] >> 25; t1[41 + i - 1] &= 0x1ffffff; - r1 = (((-t1[41 + i]) << 25) - t1[41 + i - 1]) / dv; - r1++; - sp_1024_mul_d_42(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_32 + d1 = -t1[41 + i]; + d1 <<= 25; + d1 -= t1[41 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_1024_div_word_42(-t1[41 + i], -t1[41 + i - 1], dv); +#endif + r1 -= t1[41 + i]; + sp_1024_mul_d_42(t2, sd, r1); (void)sp_1024_add_42(&t1[i], &t1[i], t2); t1[41 + i] += t1[41 + i - 1] >> 25; t1[41 + i - 1] &= 0x1ffffff; @@ -28088,15 +35617,18 @@ static int sp_1024_div_42(const sp_digit* a, const sp_digit* d, t1[41 - 2] &= 0x1ffffff; r1 = t1[41 - 1] / dv; - sp_1024_mul_d_42(t2, d, r1); - (void)sp_1024_sub_42(t1, t1, t2); + sp_1024_mul_d_42(t2, sd, r1); + sp_1024_sub_42(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 84U); for (i=0; i<40; i++) { r[i+1] += r[i] >> 25; r[i] &= 0x1ffffff; } - sp_1024_cond_add_42(r, r, d, 0 - ((r[40] < 0) ? + sp_1024_cond_add_42(r, r, sd, 0 - ((r[40] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_1024_norm_41(r); + sp_1024_rshift_42(r, r, 1); r[41] = 0; } @@ -28468,19 +36000,34 @@ SP_NOINLINE static void sp_1024_mul_add_42(sp_digit* r, const sp_digit* a, const sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int64_t tb = b; - int64_t t = 0; + sp_int64 tb = b; + sp_int64 t[4]; int i; - for (i = 0; i < 42; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffff; - t >>= 25; + t[0] = 0; + for (i = 0; i < 40; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1ffffff; + t[1] += t[0] >> 25; + r[i+1] = t[1] & 0x1ffffff; + t[2] += t[1] >> 25; + r[i+2] = t[2] & 0x1ffffff; + t[3] += t[2] >> 25; + r[i+3] = t[3] & 0x1ffffff; + t[0] = t[3] >> 25; } - r[42] += (sp_digit)t; + t[0] += (tb * a[40]) + r[40]; + t[1] = (tb * a[41]) + r[41]; + r[40] = t[0] & 0x1ffffff; + t[1] += t[0] >> 25; + r[41] = t[1] & 0x1ffffff; + r[42] += (sp_digit)(t[1] >> 25); #else - int64_t tb = b; - int64_t t[8]; + sp_int64 tb = b; + sp_int64 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffff); @@ -28508,6 +36055,34 @@ SP_NOINLINE static void sp_1024_mul_add_42(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +/* Normalize the values in each word to 25 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_1024_norm_42(sp_digit* a) +{ +#ifdef WOLFSSL_SP_SMALL + int i; + for (i = 0; i < 41; i++) { + a[i+1] += a[i] >> 25; + a[i] &= 0x1ffffff; + } +#else + int i; + for (i = 0; i < 40; i += 8) { + a[i+1] += a[i+0] >> 25; a[i+0] &= 0x1ffffff; + a[i+2] += a[i+1] >> 25; a[i+1] &= 0x1ffffff; + a[i+3] += a[i+2] >> 25; a[i+2] &= 0x1ffffff; + a[i+4] += a[i+3] >> 25; a[i+3] &= 0x1ffffff; + a[i+5] += a[i+4] >> 25; a[i+4] &= 0x1ffffff; + a[i+6] += a[i+5] >> 25; a[i+5] &= 0x1ffffff; + a[i+7] += a[i+6] >> 25; a[i+6] &= 0x1ffffff; + a[i+8] += a[i+7] >> 25; a[i+7] &= 0x1ffffff; + } + a[41] += a[40] >> 25; a[40] &= 0x1ffffff; +#endif /* WOLFSSL_SP_SMALL */ +} + /* Shift the result in the high 1024 bits down to the bottom. * * r A single precision number. 
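A note on the 1024-bit limb layout the surrounding hunks rely on: with 25-bit limbs a 1024-bit value occupies 41 limbs, the top one holding only 24 significant bits, while the arrays are 42 limbs long so reduction has head-room for carries. That is why this patch distinguishes sp_1024_norm_41 (only the 41 value-bearing limbs) from sp_1024_norm_42, why sp_1024_div_42 pre-scales dividend and divisor by one bit so the divisor's top limb is a full 25 bits (tightening each quotient-digit estimate) before shifting the remainder back with sp_1024_rshift_42, and why the sp_1024_mont_shift_42 hunk that follows starts from a[40] >> 24. The arithmetic below is only an illustrative check of those counts; the macro names are not from the patch.

#include <assert.h>

#define EX_WORD_BITS   25
#define EX_TOTAL_BITS  1024
#define EX_VALUE_WORDS ((EX_TOTAL_BITS + EX_WORD_BITS - 1) / EX_WORD_BITS)    /* 41 */
#define EX_TOP_BITS    (EX_TOTAL_BITS - (EX_VALUE_WORDS - 1) * EX_WORD_BITS)  /* 24 */

/* Check the limb counts the 1024-bit functions above are written against:
 * 41 limbs carry the value, limb 40 holds 24 bits, and shifting the high
 * 1024 bits down therefore drops 40 whole limbs plus 24 bits of limb 40,
 * i.e. r[i] = (a[40 + i] >> 24) | (a[41 + i] << 1), masked to 25 bits. */
static void limb_layout_check(void)
{
    assert(EX_VALUE_WORDS == 41);
    assert(EX_TOP_BITS == 24);
}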
@@ -28517,33 +36092,33 @@ static void sp_1024_mont_shift_42(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - word32 n; + sp_uint32 n; n = a[40] >> 24; for (i = 0; i < 40; i++) { - n += (word32)a[41 + i] << 1; + n += (sp_uint32)a[41 + i] << 1; r[i] = n & 0x1ffffff; n >>= 25; } - n += (word32)a[81] << 1; + n += (sp_uint32)a[81] << 1; r[40] = n; #else - word32 n; + sp_uint32 n; int i; - n = (word32)a[40]; + n = (sp_uint32)a[40]; n = n >> 24U; for (i = 0; i < 40; i += 8) { - n += (word32)a[i+41] << 1U; r[i+0] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+42] << 1U; r[i+1] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+43] << 1U; r[i+2] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+44] << 1U; r[i+3] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+45] << 1U; r[i+4] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+46] << 1U; r[i+5] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+47] << 1U; r[i+6] = n & 0x1ffffff; n >>= 25U; - n += (word32)a[i+48] << 1U; r[i+7] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+41] << 1U; r[i+0] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+42] << 1U; r[i+1] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+43] << 1U; r[i+2] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+44] << 1U; r[i+3] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+45] << 1U; r[i+4] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+46] << 1U; r[i+5] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+47] << 1U; r[i+6] = n & 0x1ffffff; n >>= 25U; + n += (sp_uint32)a[i+48] << 1U; r[i+7] = n & 0x1ffffff; n >>= 25U; } - n += (word32)a[81] << 1U; r[40] = n; + n += (sp_uint32)a[81] << 1U; r[40] = n; #endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[41], 0, sizeof(*r) * 41U); } @@ -28701,7 +36276,7 @@ static void sp_1024_map_42(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*42; - int32_t n; + sp_int32 n; sp_1024_mont_inv_42(t1, p->z, t + 2*42); @@ -38363,7 +45938,7 @@ static int sp_1024_ecc_is_point_42(const sp_point_1024* point, sp_digit t1[42 * 4]; #endif sp_digit* t2 = NULL; - int32_t n; + sp_int32 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 5983b9816..28e06c76f 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -52,10 +52,28 @@ #ifndef WOLFSSL_SP_ASM #if SP_WORD_SIZE == 64 -#if ((!defined(WC_NO_CACHE_RESISTANT) && \ +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii; \ + byte n[bits / 8]; \ + sp_digit s[words]; \ + XMEMCPY(s, var, sizeof(s)); \ + sp_##total##_norm_##words(s); \ + sp_##total##_to_bin_##words(s, n); \ + fprintf(stderr, name "=0x"); \ + for (ii=0; ii= 0; i--) { + r[j] |= (((sp_digit)a[i]) << s); + if (s >= 53U) { + r[j] &= 0x1fffffffffffffffL; + s = 61U - s; + if (j + 1 >= size) { + break; + } + r[++j] = (sp_digit)a[i] >> s; + s = 8U - s; + } + else { + s += 8U; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +} + +/* Convert an mp_int to an array of sp_digit. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a A multi-precision integer. 
+ */ +static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) +{ +#if DIGIT_BIT == 61 + int j; + + XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); + + for (j = a->used; j < size; j++) { + r[j] = 0; + } +#elif DIGIT_BIT > 61 + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i] << s); + r[j] &= 0x1fffffffffffffffL; + s = 61U - s; + if (j + 1 >= size) { + break; + } + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + while ((s + 61U) <= (word32)DIGIT_BIT) { + s += 61U; + r[j] &= 0x1fffffffffffffffL; + if (j + 1 >= size) { + break; + } + if (s < (word32)DIGIT_BIT) { + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + } + else { + r[++j] = (sp_digit)0; + } + } + s = (word32)DIGIT_BIT - s; + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#else + int i; + int j = 0; + int s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i]) << s; + if (s + DIGIT_BIT >= 61) { + r[j] &= 0x1fffffffffffffffL; + if (j + 1 >= size) { + break; + } + s = 61 - s; + if (s == DIGIT_BIT) { + r[++j] = 0; + s = 0; + } + else { + r[++j] = a->dp[i] >> s; + s = DIGIT_BIT - s; + } + } + else { + s += DIGIT_BIT; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#endif +} + +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 256 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_2048_to_bin_34(sp_digit* r, byte* a) +{ + int i; + int j; + int s = 0; + int b; + + for (i=0; i<33; i++) { + r[i+1] += r[i] >> 61; + r[i] &= 0x1fffffffffffffffL; + } + j = 2048 / 8 - 1; + a[j] = 0; + for (i=0; i<34 && j>=0; i++) { + b = 0; + /* lint allow cast of mismatch sp_digit and int */ + a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ + b += 8 - s; + if (j < 0) { + break; + } + while (b < 61) { + a[j--] = (byte)(r[i] >> b); + b += 8; + if (j < 0) { + break; + } + } + s = 8 - (b - 61); + if (j >= 0) { + a[j] = 0; + } + if (s != 0) { + j++; + } + } +} + +/* Normalize the values in each word to 61 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_17(sp_digit* a) +{ + int i; + for (i = 0; i < 16; i++) { + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; + } +} + +/* Normalize the values in each word to 61 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_34(sp_digit* a) +{ + int i; + for (i = 0; i < 33; i++) { + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; + } +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
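sp_2048_to_bin_34 above goes the other way, serialising 61-bit limbs as a fixed-length big-endian byte array. The version below replaces the unrolled bit juggling with a small accumulator-based bit pump, using the compiler's 128-bit integer just as sp_c64.c itself does; demo_to_bin and the test values are illustrative, not the wolfSSL routine.

#include <stdint.h>
#include <stdio.h>

/* Write 61-bit limbs (little-endian limb order) as len big-endian bytes. */
static void demo_to_bin(const uint64_t* a, int words, unsigned char* out, int len)
{
    unsigned __int128 acc = 0;      /* bit accumulator, least significant first */
    unsigned nbits = 0;             /* number of valid bits held in acc         */
    int w = 0;                      /* next limb to consume                     */
    int j = len - 1;                /* big-endian output: fill from the right   */

    while (j >= 0) {
        while (nbits < 8 && w < words) {
            acc   |= (unsigned __int128)(a[w++] & 0x1fffffffffffffffULL) << nbits;
            nbits += 61;
        }
        out[j--] = (unsigned char)(acc & 0xff);
        acc >>= 8;
        nbits = (nbits >= 8) ? (nbits - 8) : 0;
    }
}

int main(void)
{
    uint64_t a[2] = { 0x0123456789abcdefULL, 0x1ULL };  /* 2^61 + 0x123...def */
    unsigned char out[16];
    int i;

    demo_to_bin(a, 2, out, 16);
    for (i = 0; i < 16; i++)
        printf("%02x", out[i]);
    printf("\n");                   /* prints 00000000000000002123456789abcdef */
    return 0;
}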
+ */ +SP_NOINLINE static void sp_2048_mul_34(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[33]) * b[33]; + r[67] = (sp_digit)(c >> 61); + c &= 0x1fffffffffffffffL; + for (k = 65; k >= 0; k--) { + if (k >= 34) { + i = k - 33; + imax = 33; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 61; + lo &= 0x1fffffffffffffffL; + } + r[k + 2] += (sp_digit)(c >> 61); + r[k + 1] = (sp_digit)(c & 0x1fffffffffffffffL); + c = lo & 0x1fffffffffffffffL; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 61; + r[k + 2] += (sp_digit)(c >> 61); + r[k + 1] = (sp_digit)(c & 0x1fffffffffffffffL); + c = lo & 0x1fffffffffffffffL; + } + } + r[0] = (sp_digit)c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_34(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[33]) * a[33]; + r[67] = (sp_digit)(c >> 61); + c = (c & 0x1fffffffffffffffL) << 61; + for (k = 65; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 33) { + imax = k; + } + else { + imax = 33; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint128 hi; + + hi = c >> 61; + c &= 0x1fffffffffffffffL; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 61; + c &= 0x1fffffffffffffffL; + } + r[k + 2] += (sp_digit)(hi >> 61); + r[k + 1] = (sp_digit)(hi & 0x1fffffffffffffffL); + c <<= 61; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 122); + r[k + 1] = (sp_digit)((c >> 61) & 0x1fffffffffffffffL); + c = (c & 0x1fffffffffffffffL) << 61; + } + } + r[0] = (sp_digit)(c >> 61); +} + +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x *= 2 - b * x; /* here x*a==1 mod 2**64 */ + x &= 0x1fffffffffffffffL; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 61) - x; +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_d_34(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 34; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1fffffffffffffffL); + t >>= 61; + } + r[34] = (sp_digit)t; +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
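sp_2048_mont_setup above derives rho = -1/m mod 2^61 from nothing but the lowest limb: the seed expression is exact mod 2^4 for any odd limb, and each Newton step x *= 2 - m*x doubles the number of correct low bits. A standalone check of that identity; demo_mont_rho and the sample limb are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Compute -1/m0 mod 2^61 for an odd low limb m0, the same way as above. */
static uint64_t demo_mont_rho(uint64_t m0)
{
    uint64_t x = (((m0 + 2) & 4) << 1) + m0;  /* inverse mod 2^4  */
    x *= 2 - m0 * x;                          /* ...  mod 2^8     */
    x *= 2 - m0 * x;                          /* ...  mod 2^16    */
    x *= 2 - m0 * x;                          /* ...  mod 2^32    */
    x *= 2 - m0 * x;                          /* ...  mod 2^64    */
    x &= 0x1fffffffffffffffULL;               /* keep 61 bits     */
    return ((uint64_t)1 << 61) - x;           /* negate mod 2^61  */
}

int main(void)
{
    uint64_t m0  = 0x1b7e151628aed2a5ULL;     /* any odd 61-bit value */
    uint64_t rho = demo_mont_rho(m0);

    /* (m0 * rho) mod 2^61 must be 2^61 - 1, i.e. -1; prints 1. */
    printf("check: %d\n",
           ((m0 * rho) & 0x1fffffffffffffffULL) == 0x1fffffffffffffffULL);
    return 0;
}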
+ */ +SP_NOINLINE static int sp_2048_sub_17(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 17; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 2048 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_2048_mont_norm_17(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<16; i++) { + r[i] = 0x1fffffffffffffffL; + } + r[16] = 0xffffffffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_2048_sub_17(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_2048_cmp_17(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=16; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_2048_cond_sub_17(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 17; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_add_17(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 16; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffffffffffffL; + t[1] += t[0] >> 61; + r[i+1] = t[1] & 0x1fffffffffffffffL; + t[2] += t[1] >> 61; + r[i+2] = t[2] & 0x1fffffffffffffffL; + t[3] += t[2] >> 61; + r[i+3] = t[3] & 0x1fffffffffffffffL; + t[0] = t[3] >> 61; + } + t[0] += (tb * a[16]) + r[16]; + r[16] = t[0] & 0x1fffffffffffffffL; + r[17] += (sp_digit)(t[0] >> 61); +} + +/* Shift the result in the high 1024 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_2048_mont_shift_17(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[16] >> 48; + n += ((sp_int128)a[17]) << 13; + + for (i = 0; i < 16; i++) { + r[i] = n & 0x1fffffffffffffffL; + n >>= 61; + n += ((sp_int128)a[18 + i]) << 13; + } + r[16] = (sp_digit)n; + XMEMSET(&r[17], 0, sizeof(*r) * 17U); +} + +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
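sp_2048_cmp_17 and sp_2048_cond_sub_17 above avoid secret-dependent branches: the compare folds limb differences under a mask that collapses to zero once the answer is known, and the conditional subtract turns a boolean into an all-ones/all-zero mask instead of an if. A small sketch of the masking idiom; names and values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Subtract b from a only when mask is all ones (-1); no data-dependent branch. */
static void demo_cond_sub(int64_t* r, const int64_t* a, const int64_t* b,
                          int64_t mask, int words)
{
    int i;

    for (i = 0; i < words; i++)
        r[i] = a[i] - (b[i] & mask);
}

int main(void)
{
    int64_t a[3] = { 5, 6, 7 };
    int64_t b[3] = { 1, 1, 1 };
    int64_t r[3];
    int     do_it = 1;                      /* some secret-dependent bit */
    int64_t mask  = 0 - (int64_t)do_it;     /* 0 -> 0, 1 -> all ones     */

    demo_cond_sub(r, a, b, mask, 3);
    printf("%lld %lld %lld\n",
           (long long)r[0], (long long)r[1], (long long)r[2]);
    return 0;
}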
+ */ +static void sp_2048_mont_reduce_17(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_2048_norm_17(a + 17); + + for (i=0; i<16; i++) { + mu = (a[i] * mp) & 0x1fffffffffffffffL; + sp_2048_mul_add_17(a+i, m, mu); + a[i+1] += a[i] >> 61; + } + mu = (a[i] * mp) & 0xffffffffffffL; + sp_2048_mul_add_17(a+i, m, mu); + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; + sp_2048_mont_shift_17(a, a); + sp_2048_cond_sub_17(a, a, m, 0 - (((a[16] - m[16]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_2048_norm_17(a); +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_2048_mul_17(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[16]) * b[16]; + r[33] = (sp_digit)(c >> 61); + c &= 0x1fffffffffffffffL; + for (k = 31; k >= 0; k--) { + if (k >= 17) { + i = k - 16; + imax = 16; + } + else { + i = 0; + imax = k; + } + if (imax - i > 15) { + int imaxlo; + lo = 0; + for (imaxlo = i; imaxlo <= imax; imaxlo += 15) { + for (; i <= imax && i < imaxlo + 15; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 61; + lo &= 0x1fffffffffffffffL; + } + r[k + 2] += (sp_digit)(c >> 61); + r[k + 1] = (sp_digit)(c & 0x1fffffffffffffffL); + c = lo & 0x1fffffffffffffffL; + } + else { + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 61; + r[k + 2] += (sp_digit)(c >> 61); + r[k + 1] = (sp_digit)(c & 0x1fffffffffffffffL); + c = lo & 0x1fffffffffffffffL; + } + } + r[0] = (sp_digit)c; +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_2048_mont_mul_17(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_2048_mul_17(r, a, b); + sp_2048_mont_reduce_17(r, m, mp); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_2048_sqr_17(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[16]) * a[16]; + r[33] = (sp_digit)(c >> 61); + c = (c & 0x1fffffffffffffffL) << 61; + for (k = 31; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 16) { + imax = k; + } + else { + imax = 16; + } + if (imax - i >= 14) { + int imaxlo; + sp_uint128 hi; + + hi = c >> 61; + c &= 0x1fffffffffffffffL; + for (imaxlo = i; imaxlo <= imax; imaxlo += 14) { + t = 0; + for (; i <= imax && i < imaxlo + 14; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + hi += c >> 61; + c &= 0x1fffffffffffffffL; + } + r[k + 2] += (sp_digit)(hi >> 61); + r[k + 1] = (sp_digit)(hi & 0x1fffffffffffffffL); + c <<= 61; + } + else + { + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 122); + r[k + 1] = (sp_digit)((c >> 61) & 0x1fffffffffffffffL); + c = (c & 0x1fffffffffffffffL) << 61; + } + } + r[0] = (sp_digit)(c >> 61); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). 
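sp_2048_mont_reduce_17 above clears one 61-bit limb per pass: mu = a[i] * rho mod 2^61 is chosen so that adding mu * m makes that limb zero, and after 17 passes the surviving high half is shifted down. A one-word version of the same reduction, using the compiler's 128-bit integer as sp_c64.c does; demo_redc_1 and the test values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Montgomery reduction of one value t < m * 2^61: returns t / 2^61 mod m.
 * rho = -1/m mod 2^61, as produced by sp_2048_mont_setup(). */
static uint64_t demo_redc_1(unsigned __int128 t, uint64_t m, uint64_t rho)
{
    uint64_t mu = ((uint64_t)t * rho) & 0x1fffffffffffffffULL;
    unsigned __int128 u = t + (unsigned __int128)mu * m;  /* low 61 bits now 0 */
    uint64_t r = (uint64_t)(u >> 61);

    return (r >= m) ? (r - m) : r;   /* the real code does this with a mask */
}

int main(void)
{
    uint64_t m = 0x1812391ac79d3b61ULL;       /* odd, below 2^61      */
    uint64_t x = (((m + 2) & 4) << 1) + m;    /* same Newton steps as */
    unsigned __int128 t;                      /* sp_2048_mont_setup() */
    uint64_t rho;
    uint64_t r;

    x *= 2 - m * x;  x *= 2 - m * x;
    x *= 2 - m * x;  x *= 2 - m * x;
    rho = ((uint64_t)1 << 61) - (x & 0x1fffffffffffffffULL);

    t = (unsigned __int128)0x123456789abcdefULL * 0xfeedfacecafeULL;
    r = demo_redc_1(t, m, rho);

    /* r * 2^61 must be congruent to t modulo m; prints 1. */
    printf("check: %d\n",
           (int)((((unsigned __int128)r << 61) % m) == (t % m)));
    return 0;
}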
+ * mp Montogmery mulitplier. + */ +static void sp_2048_mont_sqr_17(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_2048_sqr_17(r, a); + sp_2048_mont_reduce_17(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_d_17(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 17; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1fffffffffffffffL); + t >>= 61; + } + r[17] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_2048_cond_add_17(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 17; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_2048_add_17(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 17; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_2048_rshift_17(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<16; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (61 - n))) & 0x1fffffffffffffffL; + } + r[16] = a[16] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_2048_div_word_17(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 61 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 59); + r = d / dv; + d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 57) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 55) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 53) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 51) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 49) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 47) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 45) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 43) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 41) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 39) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 37) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 35) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 33) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 31) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 31 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 29) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 33 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 27) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 35 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 39 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 41 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 43 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 45 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 47 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 51 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 53 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 55 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 57 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 59 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 61 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 61 bits from d1 and top 2 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
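sp_2048_div_word_17 above (and its _34 twin later in the file) estimates a two-limb by one-limb quotient without a 128-by-64 hardware divide by feeding the low limb into the running remainder two bits at a time. The fully unrolled body compresses to the loop below, cross-checked against a 128-bit divide; names and test values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Quotient of the 122-bit value (d1:d0) divided by dv, where limbs hold
 * 61 bits and dv is normalized (bit 60 set).  Same bit-feeding scheme as
 * sp_2048_div_word_17(), just rolled back into a loop. */
static uint64_t demo_div_word(uint64_t d1, uint64_t d0, uint64_t dv)
{
    uint64_t d = (d1 << 2) + (d0 >> 59);  /* all of d1, top 2 bits of d0 */
    uint64_t r = d / dv;
    int s;

    d -= r * dv;
    for (s = 57; s >= 1; s -= 2) {        /* next 2 bits of d0 each pass */
        uint64_t t;

        r <<= 2;
        d  = (d << 2) + ((d0 >> s) & 3);
        t  = d / dv;
        d -= t * dv;
        r += t;
    }
    r <<= 1;                              /* final remaining bit of d0 */
    d  = (d << 1) + (d0 & 1);
    return r + d / dv;
}

int main(void)
{
    uint64_t dv = 0x1f0f0f0f0f0f0f0fULL;  /* bit 60 set */
    uint64_t d1 = 0x0123456789abcdefULL;  /* d1 < dv    */
    uint64_t d0 = 0x1fedcba987654321ULL;
    unsigned __int128 n = ((unsigned __int128)d1 << 61) + d0;

    printf("check: %d\n", demo_div_word(d1, d0, dv) == (uint64_t)(n / dv));
    return 0;
}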
+ */ +static int sp_2048_div_17(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 17 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 17 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 34 + 1; + sd = t2 + 17 + 1; + + sp_2048_mul_d_17(sd, d, (sp_digit)1 << 13); + sp_2048_mul_d_34(t1, a, (sp_digit)1 << 13); + dv = sd[16]; + t1[17 + 17] += t1[17 + 17 - 1] >> 61; + t1[17 + 17 - 1] &= 0x1fffffffffffffffL; + for (i=17; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[17 + i]; + d1 <<= 61; + d1 += t1[17 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_17(t1[17 + i], t1[17 + i - 1], dv); +#endif + + sp_2048_mul_d_17(t2, sd, r1); + (void)sp_2048_sub_17(&t1[i], &t1[i], t2); + sp_2048_norm_17(&t1[i]); + t1[17 + i] -= t2[17]; + t1[17 + i] += t1[17 + i - 1] >> 61; + t1[17 + i - 1] &= 0x1fffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[17 + i]; + d1 <<= 61; + d1 -= t1[17 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_17(-t1[17 + i], -t1[17 + i - 1], dv); +#endif + r1 -= t1[17 + i]; + sp_2048_mul_d_17(t2, sd, r1); + (void)sp_2048_add_17(&t1[i], &t1[i], t2); + t1[17 + i] += t1[17 + i - 1] >> 61; + t1[17 + i - 1] &= 0x1fffffffffffffffL; + } + t1[17 - 1] += t1[17 - 2] >> 61; + t1[17 - 2] &= 0x1fffffffffffffffL; + r1 = t1[17 - 1] / dv; + + sp_2048_mul_d_17(t2, sd, r1); + sp_2048_sub_17(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 34U); + for (i=0; i<16; i++) { + r[i+1] += r[i] >> 61; + r[i] &= 0x1fffffffffffffffL; + } + sp_2048_cond_add_17(r, r, sd, 0 - ((r[16] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_2048_norm_17(r); + sp_2048_rshift_17(r, r, 13); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_2048_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_17(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
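sp_2048_div_17 above first scales both operands by 2^13 (the mul_d calls with 1 << 13) so that the divisor's top limb dv has its high bit set, which keeps the per-digit quotient estimates tight, and undoes the scaling at the end with rshift_17(r, r, 13). The scaling is harmless because multiplying dividend and divisor by the same power of two leaves the quotient unchanged and merely scales the remainder; a single-word check of that identity with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t a = 0x3456789abcdULL;   /* dividend */
    uint64_t d = 0x1cba98765ULL;     /* divisor  */
    unsigned k = 13;                 /* same shift as the code above */

    uint64_t q1 = a / d;
    uint64_t r1 = a % d;
    uint64_t q2 = (a << k) / (d << k);
    uint64_t r2 = (a << k) % (d << k);

    /* prints: quotient unchanged: 1, remainder scaled: 1 */
    printf("quotient unchanged: %d, remainder scaled: %d\n",
           q1 == q2, (r2 >> k) == r1);
    return 0;
}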
+ */ +static int sp_2048_mod_exp_17(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 34]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 17 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 17 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 17U * 2U); + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_17(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_17(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 17U); + } + } + if (err == MP_OKAY) { + sp_2048_mul_17(t[1], t[1], norm); + err = sp_2048_mod_17(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 61; + c = bits % 61; + n = e[i--] << (61 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 61; + } + + y = (int)((n >> 60) & 1); + n <<= 1; + + sp_2048_mont_mul_17(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 17 * 2); + sp_2048_mont_sqr_17(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 17 * 2); + } + + sp_2048_mont_reduce_17(t[0], m, mp); + n = sp_2048_cmp_17(t[0], m); + sp_2048_cond_sub_17(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 17 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 34]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 17 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 17 * 2); + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_17(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_17(t[1], a, m); + if (err == MP_OKAY) { + sp_2048_mul_17(t[1], t[1], norm); + err = sp_2048_mod_17(t[1], t[1], m); + } + } + else { + sp_2048_mul_17(t[1], a, norm); + err = sp_2048_mod_17(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 61; + c = bits % 61; + n = e[i--] << (61 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 61; + } + + y = (int)((n >> 60) & 1); + n <<= 1; + + sp_2048_mont_mul_17(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 17 * 2); + sp_2048_mont_sqr_17(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 17 * 2); + } + + sp_2048_mont_reduce_17(t[0], m, mp); + n = sp_2048_cmp_17(t[0], m); + sp_2048_cond_sub_17(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 17 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(32 * 34) + 34]; +#endif + sp_digit* t[32]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 34) + 34), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<32; i++) + t[i] = td + i * 34; + rt = td + 1088; + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_17(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_17(t[1], a, m); + if (err == MP_OKAY) { + sp_2048_mul_17(t[1], t[1], norm); + err = sp_2048_mod_17(t[1], t[1], m); + } + } + else { + sp_2048_mul_17(t[1], a, norm); + err = sp_2048_mod_17(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_17(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_17(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_17(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_17(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_17(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_17(t[ 7], t[ 4], t[ 3], m, mp); + sp_2048_mont_sqr_17(t[ 8], t[ 4], m, mp); + sp_2048_mont_mul_17(t[ 9], t[ 5], t[ 4], m, mp); + sp_2048_mont_sqr_17(t[10], t[ 5], m, mp); + sp_2048_mont_mul_17(t[11], t[ 6], t[ 5], m, mp); + sp_2048_mont_sqr_17(t[12], t[ 6], m, mp); + sp_2048_mont_mul_17(t[13], t[ 7], t[ 6], m, mp); + sp_2048_mont_sqr_17(t[14], t[ 7], m, mp); + sp_2048_mont_mul_17(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_17(t[16], t[ 8], m, mp); + sp_2048_mont_mul_17(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_17(t[18], t[ 9], m, mp); + sp_2048_mont_mul_17(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_17(t[20], t[10], m, mp); + sp_2048_mont_mul_17(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_17(t[22], t[11], m, mp); + sp_2048_mont_mul_17(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_17(t[24], t[12], m, mp); + sp_2048_mont_mul_17(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_17(t[26], t[13], m, mp); + sp_2048_mont_mul_17(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_17(t[28], t[14], m, mp); + sp_2048_mont_mul_17(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_17(t[30], t[15], m, mp); + sp_2048_mont_mul_17(t[31], t[16], t[15], m, mp); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 60) / 61) - 1; + c = bits % 61; + if (c == 0) { + c = 61; + } + if (i < 17) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (3 - c); + c += 61; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 34); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 56; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 3; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 61 - c; + } + + sp_2048_mont_sqr_17(rt, rt, m, mp); + sp_2048_mont_sqr_17(rt, rt, m, mp); + sp_2048_mont_sqr_17(rt, rt, m, mp); + sp_2048_mont_sqr_17(rt, rt, m, mp); + sp_2048_mont_sqr_17(rt, rt, m, mp); + + sp_2048_mont_mul_17(rt, rt, t[y], m, mp); + } + + 
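The non-small, non-cache-resistant branch above precomputes t[1..31] and then consumes the exponent in 5-bit windows: five Montgomery squarings followed by one multiply per window. The same structure on a single machine word, with 4-bit windows and plain modular arithmetic in place of the Montgomery operations; the demo_* names and test values are illustrative.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((unsigned __int128)a * b) % m);
}

/* Fixed 4-bit windows over a 64-bit exponent: square 4 times, multiply once. */
static uint64_t demo_powmod(uint64_t a, uint64_t e, uint64_t m)
{
    uint64_t t[16];
    uint64_t r;
    int i;

    t[0] = 1 % m;
    for (i = 1; i < 16; i++)            /* t[i] = a^i mod m */
        t[i] = demo_mulmod(t[i - 1], a, m);

    r = t[(e >> 60) & 0xf];             /* top window first */
    for (i = 56; i >= 0; i -= 4) {
        r = demo_mulmod(r, r, m);
        r = demo_mulmod(r, r, m);
        r = demo_mulmod(r, r, m);
        r = demo_mulmod(r, r, m);       /* four squarings ...        */
        r = demo_mulmod(r, t[(e >> i) & 0xf], m);  /* ... one multiply */
    }
    return r;
}

int main(void)
{
    /* Fermat check: a^(p-1) mod p == 1 for the prime 1000000007; prints 1. */
    printf("%llu\n", (unsigned long long)
           demo_powmod(7, 1000000006ULL, 1000000007ULL));
    return 0;
}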
sp_2048_mont_reduce_17(rt, m, mp); + n = sp_2048_cmp_17(rt, m); + sp_2048_cond_sub_17(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 34); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_2048_sub_34(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 34; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 2048 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_2048_mont_norm_34(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<33; i++) { + r[i] = 0x1fffffffffffffffL; + } + r[33] = 0x7ffffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_2048_sub_34(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_2048_cmp_34(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=33; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_2048_cond_sub_34(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 34; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_add_34(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 32; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1fffffffffffffffL; + t[1] += t[0] >> 61; + r[i+1] = t[1] & 0x1fffffffffffffffL; + t[2] += t[1] >> 61; + r[i+2] = t[2] & 0x1fffffffffffffffL; + t[3] += t[2] >> 61; + r[i+3] = t[3] & 0x1fffffffffffffffL; + t[0] = t[3] >> 61; + } + t[0] += (tb * a[32]) + r[32]; + t[1] = (tb * a[33]) + r[33]; + r[32] = t[0] & 0x1fffffffffffffffL; + t[1] += t[0] >> 61; + r[33] = t[1] & 0x1fffffffffffffffL; + r[34] += (sp_digit)(t[1] >> 61); +} + +/* Shift the result in the high 2048 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
+ */ +static void sp_2048_mont_shift_34(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[33] >> 35; + n += ((sp_int128)a[34]) << 26; + + for (i = 0; i < 33; i++) { + r[i] = n & 0x1fffffffffffffffL; + n >>= 61; + n += ((sp_int128)a[35 + i]) << 26; + } + r[33] = (sp_digit)n; + XMEMSET(&r[34], 0, sizeof(*r) * 34U); +} + +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_2048_mont_reduce_34(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_2048_norm_34(a + 34); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<33; i++) { + mu = (a[i] * mp) & 0x1fffffffffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + } + mu = (a[i] * mp) & 0x7ffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; + } + else { + for (i=0; i<33; i++) { + mu = a[i] & 0x1fffffffffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + } + mu = a[i] & 0x7ffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; + } +#else + for (i=0; i<33; i++) { + mu = (a[i] * mp) & 0x1fffffffffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + } + mu = (a[i] * mp) & 0x7ffffffffL; + sp_2048_mul_add_34(a+i, m, mu); + a[i+1] += a[i] >> 61; + a[i] &= 0x1fffffffffffffffL; +#endif + sp_2048_mont_shift_34(a, a); + sp_2048_cond_sub_34(a, a, m, 0 - (((a[33] - m[33]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_2048_norm_34(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_2048_mont_mul_34(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_2048_mul_34(r, a, b); + sp_2048_mont_reduce_34(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_2048_mont_sqr_34(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_2048_sqr_34(r, a); + sp_2048_mont_reduce_34(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_d_68(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 68; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1fffffffffffffffL); + t >>= 61; + } + r[68] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_2048_cond_add_34(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 17; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
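sp_2048_mont_reduce_34 above splits on mp != 1 under WOLFSSL_SP_DH: when the low limb of the modulus is all ones, as it is for the RFC 7919 FFDHE primes whose last 64 bits are all set, mont_setup() returns rho == 1 and mu = (a[i] * mp) collapses to a plain mask, so the per-limb multiply can be dropped. A quick standalone check of that reading; the variable names are illustrative.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t m0 = 0x1fffffffffffffffULL;      /* low 61 bits all ones   */
    uint64_t x  = (((m0 + 2) & 4) << 1) + m0; /* same Newton iteration  */
    uint64_t rho;                             /* as sp_2048_mont_setup  */

    x *= 2 - m0 * x;  x *= 2 - m0 * x;
    x *= 2 - m0 * x;  x *= 2 - m0 * x;
    x &= 0x1fffffffffffffffULL;
    rho = ((uint64_t)1 << 61) - x;

    printf("rho = %llu\n", (unsigned long long)rho);  /* prints 1 */
    return 0;
}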
+ */ +SP_NOINLINE static int sp_2048_add_34(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 34; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_2048_rshift_34(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<33; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (61 - n))) & 0x1fffffffffffffffL; + } + r[33] = a[33] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_2048_div_word_34(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 61 bits from d1 and top 2 bits from d0. */ + d = (d1 << 2) + (d0 >> 59); + r = d / dv; + d -= r * dv; + /* Up to 3 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 57) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 5 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 55) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 53) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 51) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 11 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 49) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 47) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 15 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 45) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 43) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 41) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 39) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 23 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 37) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 35) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 27 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 33) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 31) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 31 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 29) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 33 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 27) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 35 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 25) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 23) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 39 bits in r */ + /* Next 2 bits from d0. 
*/ + r <<= 2; + d <<= 2; + d += (d0 >> 21) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 41 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 19) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 43 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 17) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 45 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 15) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 47 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 13) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 11) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 51 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 9) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 53 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 7) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 55 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 5) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 57 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 3) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 59 bits in r */ + /* Next 2 bits from d0. */ + r <<= 2; + d <<= 2; + d += (d0 >> 1) & ((1 << 2) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 61 bits in r */ + /* Remaining 1 bits from d0. */ + r <<= 1; + d <<= 1; + d += d0 & ((1 << 1) - 1); + t = d / dv; + r += t; + + /* All 61 bits from d1 and top 2 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
+ */ +static int sp_2048_div_34(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 34 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 34 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 68 + 1; + sd = t2 + 34 + 1; + + sp_2048_mul_d_34(sd, d, (sp_digit)1 << 26); + sp_2048_mul_d_68(t1, a, (sp_digit)1 << 26); + dv = sd[33]; + t1[34 + 34] += t1[34 + 34 - 1] >> 61; + t1[34 + 34 - 1] &= 0x1fffffffffffffffL; + for (i=34; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[34 + i]; + d1 <<= 61; + d1 += t1[34 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_34(t1[34 + i], t1[34 + i - 1], dv); +#endif + + sp_2048_mul_d_34(t2, sd, r1); + (void)sp_2048_sub_34(&t1[i], &t1[i], t2); + sp_2048_norm_34(&t1[i]); + t1[34 + i] -= t2[34]; + t1[34 + i] += t1[34 + i - 1] >> 61; + t1[34 + i - 1] &= 0x1fffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[34 + i]; + d1 <<= 61; + d1 -= t1[34 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_34(-t1[34 + i], -t1[34 + i - 1], dv); +#endif + r1 -= t1[34 + i]; + sp_2048_mul_d_34(t2, sd, r1); + (void)sp_2048_add_34(&t1[i], &t1[i], t2); + t1[34 + i] += t1[34 + i - 1] >> 61; + t1[34 + i - 1] &= 0x1fffffffffffffffL; + } + t1[34 - 1] += t1[34 - 2] >> 61; + t1[34 - 2] &= 0x1fffffffffffffffL; + r1 = t1[34 - 1] / dv; + + sp_2048_mul_d_34(t2, sd, r1); + sp_2048_sub_34(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 68U); + for (i=0; i<33; i++) { + r[i+1] += r[i] >> 61; + r[i] &= 0x1fffffffffffffffL; + } + sp_2048_cond_add_34(r, r, sd, 0 - ((r[33] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_2048_norm_34(r); + sp_2048_rshift_34(r, r, 26); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_2048_mod_34(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_34(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_2048_mod_exp_34(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 68]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 34 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 34 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 34U * 2U); + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_34(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 34U); + } + } + if (err == MP_OKAY) { + sp_2048_mul_34(t[1], t[1], norm); + err = sp_2048_mod_34(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 61; + c = bits % 61; + n = e[i--] << (61 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 61; + } + + y = (int)((n >> 60) & 1); + n <<= 1; + + sp_2048_mont_mul_34(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 34 * 2); + sp_2048_mont_sqr_34(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 34 * 2); + } + + sp_2048_mont_reduce_34(t[0], m, mp); + n = sp_2048_cmp_34(t[0], m); + sp_2048_cond_sub_34(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 34 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 68]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 34 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 34 * 2); + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_34(t[1], a, m); + if (err == MP_OKAY) { + sp_2048_mul_34(t[1], t[1], norm); + err = sp_2048_mod_34(t[1], t[1], m); + } + } + else { + sp_2048_mul_34(t[1], a, norm); + err = sp_2048_mod_34(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 61; + c = bits % 61; + n = e[i--] << (61 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 61; + } + + y = (int)((n >> 60) & 1); + n <<= 1; + + sp_2048_mont_mul_34(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 34 * 2); + sp_2048_mont_sqr_34(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 34 * 2); + } + + sp_2048_mont_reduce_34(t[0], m, mp); + n = sp_2048_cmp_34(t[0], m); + sp_2048_cond_sub_34(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 34 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 68) + 68]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 68) + 68), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 68; + rt = td + 1088; + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + + if (reduceA != 0) { + err = sp_2048_mod_34(t[1], a, m); + if (err == MP_OKAY) { + sp_2048_mul_34(t[1], t[1], norm); + err = sp_2048_mod_34(t[1], t[1], m); + } + } + else { + sp_2048_mul_34(t[1], a, norm); + err = sp_2048_mod_34(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_34(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_34(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_34(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_34(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_34(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_34(t[ 7], t[ 4], t[ 3], m, mp); + sp_2048_mont_sqr_34(t[ 8], t[ 4], m, mp); + sp_2048_mont_mul_34(t[ 9], t[ 5], t[ 4], m, mp); + sp_2048_mont_sqr_34(t[10], t[ 5], m, mp); + sp_2048_mont_mul_34(t[11], t[ 6], t[ 5], m, mp); + sp_2048_mont_sqr_34(t[12], t[ 6], m, mp); + sp_2048_mont_mul_34(t[13], t[ 7], t[ 6], m, mp); + sp_2048_mont_sqr_34(t[14], t[ 7], m, mp); + sp_2048_mont_mul_34(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 60) / 61) - 1; + c = bits % 61; + if (c == 0) { + c = 61; + } + if (i < 34) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (3 - c); + c += 61; + } + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 68); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 57; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 3; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 61 - c; + } + + sp_2048_mont_sqr_34(rt, rt, m, mp); + sp_2048_mont_sqr_34(rt, rt, m, mp); + sp_2048_mont_sqr_34(rt, rt, m, mp); + sp_2048_mont_sqr_34(rt, rt, m, mp); + + sp_2048_mont_mul_34(rt, rt, t[y], m, mp); + } + + sp_2048_mont_reduce_34(rt, m, mp); + n = sp_2048_cmp_34(rt, m); + sp_2048_cond_sub_34(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 68); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_RSA +/* RSA public key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * em Public exponent. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 256 bytes long. + * outLen Number of bytes in result. 
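sp_RsaPublic_2048 below keeps the whole public exponent in one limb, special-cases e == 3, and otherwise runs a plain left-to-right square-and-multiply over the bits of e[0]. The same loop shape on a single machine word, without the Montgomery arithmetic; names and test values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Left-to-right square-and-multiply on a one-word exponent. */
static uint64_t demo_powmod_bits(uint64_t a, uint64_t e, uint64_t m)
{
    unsigned __int128 r;
    int i;

    for (i = 63; i >= 0; i--)          /* locate the top set bit of e */
        if ((e >> i) & 1)
            break;
    if (i < 0)
        return 1 % m;                  /* e == 0 */

    r = a % m;
    for (i--; i >= 0; i--) {
        r = (r * r) % m;               /* square for every bit ...     */
        if ((e >> i) & 1)
            r = (r * a) % m;           /* ... multiply on the set bits */
    }
    return (uint64_t)r;
}

int main(void)
{
    /* 2^65537 mod (2^31 - 1): since 2^31 == 1 mod 2^31 - 1, this prints 8. */
    printf("%llu\n", (unsigned long long)
           demo_powmod_bits(2, 65537, 2147483647ULL));
    return 0;
}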
+ * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. + */ +int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, + const mp_int* mm, byte* out, word32* outLen) +{ +#ifdef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[34 * 5]; +#endif + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit* norm = NULL; + sp_digit e[1] = {0}; + sp_digit mp; + int i; + int err = MP_OKAY; + + if (*outLen < 256U) { + err = MP_TO_E; + } + + if (err == MP_OKAY) { + if (mp_count_bits(em) > 61) { + err = MP_READ_E; + } + else if (inLen > 256U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + r = a + 34 * 2; + m = r + 34 * 2; + norm = r; + + sp_2048_from_bin(a, 34, in, inLen); +#if DIGIT_BIT >= 61 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + + if (err == MP_OKAY) { + sp_2048_from_mp(m, 34, mm); + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + } + if (err == MP_OKAY) { + sp_2048_mul_34(a, a, norm); + err = sp_2048_mod_34(a, a, m); + } + if (err == MP_OKAY) { + for (i=60; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 34 * 2); + for (i--; i>=0; i--) { + sp_2048_mont_sqr_34(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_2048_mont_mul_34(r, r, a, m, mp); + } + } + sp_2048_mont_reduce_34(r, m, mp); + mp = sp_2048_cmp_34(r, m); + sp_2048_cond_sub_34(r, r, m, ((mp < 0) ? 
+ (sp_digit)1 : (sp_digit)0)- 1); + + sp_2048_to_bin_34(r, out); + *outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[34 * 5]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit e[1] = {0}; + int err = MP_OKAY; + + if (*outLen < 256U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(em) > 61) { + err = MP_READ_E; + } + else if (inLen > 256U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d; + r = a + 34 * 2; + m = r + 34 * 2; + + sp_2048_from_bin(a, 34, in, inLen); +#if DIGIT_BIT >= 61 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + if (err == MP_OKAY) { + sp_2048_from_mp(m, 34, mm); + + if (e[0] == 0x3) { + sp_2048_sqr_34(r, a); + err = sp_2048_mod_34(r, r, m); + if (err == MP_OKAY) { + sp_2048_mul_34(r, a, r); + err = sp_2048_mod_34(r, r, m); + } + } + else { + sp_digit* norm = r; + int i; + sp_digit mp; + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + + sp_2048_mul_34(a, a, norm); + err = sp_2048_mod_34(a, a, m); + + if (err == MP_OKAY) { + for (i=60; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 68U); + for (i--; i>=0; i--) { + sp_2048_mont_sqr_34(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_2048_mont_mul_34(r, r, a, m, mp); + } + } + sp_2048_mont_reduce_34(r, m, mp); + mp = sp_2048_cmp_34(r, m); + sp_2048_cond_sub_34(r, r, m, ((mp < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + } + } + + if (err == MP_OKAY) { + sp_2048_to_bin_34(r, out); + *outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#endif /* WOLFSSL_SP_SMALL */ +} + +#ifndef WOLFSSL_RSA_PUBLIC_ONLY +#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM) +#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */ +/* RSA private key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * dm Private exponent. + * pm First prime. + * qm Second prime. + * dpm First prime's CRT exponent. + * dqm Second prime's CRT exponent. + * qim Inverse of second prime mod p. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 256 bytes long. + * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. 
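The SP_RSA_PRIVATE_EXP_D / RSA_LOW_MEM branch of sp_RsaPrivate_2048 below does one full-width exponentiation with d; the CRT branch that follows it in the file works modulo p and q separately and recombines the halves. A toy recombination using Garner's formula and the textbook key p = 61, q = 53, e = 17, d = 2753; this only illustrates the arithmetic, it is not the wolfSSL data path.

#include <stdint.h>
#include <stdio.h>

static uint64_t powmod(uint64_t a, uint64_t e, uint64_t m)
{
    unsigned __int128 r = 1;

    a %= m;
    while (e != 0) {
        if (e & 1)
            r = (r * a) % m;
        a = (uint64_t)(((unsigned __int128)a * a) % m);
        e >>= 1;
    }
    return (uint64_t)r;
}

int main(void)
{
    uint64_t p = 61, q = 53, e = 17, d = 2753;   /* toy key, n = 3233 */
    uint64_t n  = p * q;
    uint64_t dp = d % (p - 1);
    uint64_t dq = d % (q - 1);
    uint64_t qinv = powmod(q, p - 2, p);         /* q^-1 mod p, p prime */
    uint64_t c  = powmod(42, e, n);              /* "encrypt" 42        */

    uint64_t m1 = powmod(c, dp, p);              /* half mod p */
    uint64_t m2 = powmod(c, dq, q);              /* half mod q */
    uint64_t h  = ((m1 + p - m2 % p) % p) * qinv % p;
    uint64_t m  = m2 + h * q;                    /* Garner recombination */

    printf("decrypted: %llu\n", (unsigned long long)m);   /* prints 42 */
    return 0;
}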
+ */ +int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, + const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm, + const mp_int* qim, const mp_int* mm, byte* out, word32* outLen) +{ +#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[34 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 256U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 2048) { + err = MP_READ_E; + } + else if (inLen > 256) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 34; + m = a + 68; + r = a; + + sp_2048_from_bin(a, 34, in, inLen); + sp_2048_from_mp(d, 34, dm); + sp_2048_from_mp(m, 34, mm); + err = sp_2048_mod_exp_34(r, a, d, 2048, m, 0); + } + + if (err == MP_OKAY) { + sp_2048_to_bin_34(r, out); + *outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 34); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[34 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 256U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 2048) { + err = MP_READ_E; + } + else if (inLen > 256U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 34; + m = a + 68; + r = a; + + sp_2048_from_bin(a, 34, in, inLen); + sp_2048_from_mp(d, 34, dm); + sp_2048_from_mp(m, 34, mm); + err = sp_2048_mod_exp_34(r, a, d, 2048, m, 0); + } + + if (err == MP_OKAY) { + sp_2048_to_bin_34(r, out); + *outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 34); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#else +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[17 * 8]; +#endif + sp_digit* p = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r 
= NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 256U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 256) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 17 * 8, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + if (err == MP_OKAY) { + p = a + 34; + qi = dq = dp = p + 17; + tmpa = qi + 17; + tmpb = tmpa + 34; + r = a; + + sp_2048_from_bin(a, 34, in, inLen); + sp_2048_from_mp(p, 17, pm); + sp_2048_from_mp(dp, 17, dpm); + err = sp_2048_mod_exp_17(tmpa, a, dp, 1024, p, 1); + } + if (err == MP_OKAY) { + sp_2048_from_mp(p, 17, qm); + sp_2048_from_mp(dq, 17, dqm); + err = sp_2048_mod_exp_17(tmpb, a, dq, 1024, p, 1); + } + if (err == MP_OKAY) { + sp_2048_from_mp(p, 17, pm); + (void)sp_2048_sub_17(tmpa, tmpa, tmpb); + sp_2048_norm_17(tmpa); + sp_2048_cond_add_17(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[16] >> 63)); + sp_2048_cond_add_17(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[16] >> 63)); + + sp_2048_from_mp(qi, 17, qim); + sp_2048_mul_17(tmpa, tmpa, qi); + err = sp_2048_mod_17(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_2048_from_mp(p, 17, qm); + sp_2048_mul_17(tmpa, p, tmpa); + (void)sp_2048_add_34(r, tmpb, tmpa); + sp_2048_norm_34(r); + + sp_2048_to_bin_34(r, out); + *outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 17 * 8); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[17 * 13]; +#endif + sp_digit* p = NULL; + sp_digit* q = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 256U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 256U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 17 * 13, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = a + 34 * 2; + q = p + 17; + dp = q + 17; + dq = dp + 17; + qi = dq + 17; + tmpa = qi + 17; + tmpb = tmpa + 34; + r = a; + + sp_2048_from_bin(a, 34, in, inLen); + sp_2048_from_mp(p, 17, pm); + sp_2048_from_mp(q, 17, qm); + sp_2048_from_mp(dp, 17, dpm); + sp_2048_from_mp(dq, 17, dqm); + sp_2048_from_mp(qi, 17, qim); + + err = sp_2048_mod_exp_17(tmpa, a, dp, 1024, p, 1); + } + if (err == MP_OKAY) { + err = sp_2048_mod_exp_17(tmpb, a, dq, 1024, q, 1); + } + + if (err == MP_OKAY) { + (void)sp_2048_sub_17(tmpa, tmpa, tmpb); + sp_2048_norm_17(tmpa); + sp_2048_cond_add_17(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[16] >> 63)); + sp_2048_cond_add_17(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[16] >> 63)); + sp_2048_mul_17(tmpa, tmpa, qi); + err = sp_2048_mod_17(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_2048_mul_17(tmpa, tmpa, q); + (void)sp_2048_add_34(r, tmpb, tmpa); + sp_2048_norm_34(r); + + sp_2048_to_bin_34(r, out); + 
*outLen = 256; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) +if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 17 * 13); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); + #endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */ +} + +#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */ +#endif /* WOLFSSL_HAVE_SP_RSA */ +#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_2048_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 61 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 34); + r->used = 34; + mp_clamp(r); +#elif DIGIT_BIT < 61 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 34; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 61) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 61 - s; + } + r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 34; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 61 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 61 - s; + } + else { + s += 61; + } + } + r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
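The CRT private-key paths above compute the two half-size exponentiations mp = c^dp mod p and mq = c^dq mod q and then recombine them with qi = q^-1 mod p: the two sp_2048_cond_add_17 calls keep the difference (mp - mq) non-negative before it is multiplied by qi and reduced mod p, and the final sp_2048_mul_17 plus sp_2048_add_34 pair forms mq + q * h. The same Garner-style recombination with toy word-sized operands (function name and values are illustrative; a 128-bit product type is assumed):

    #include <stdint.h>

    /* Garner recombination: given mp = c^dp mod p and mq = c^dq mod q,
     * the result mod p*q is mq + q * ((qInv * (mp - mq)) mod p).
     * Word-sized sketch; assumes p*q fits in 64 bits. */
    static uint64_t crt_combine(uint64_t mp, uint64_t mq, uint64_t p,
                                uint64_t q, uint64_t qInv)
    {
        uint64_t d = (mp + p - (mq % p)) % p;               /* (mp - mq) mod p */
        uint64_t h = (uint64_t)(((__uint128_t)qInv * d) % p);
        return mq + q * h;
    }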
+ */ +int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[34 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 2048) { + err = MP_READ_E; + } + else if (expBits > 2048) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 34 * 2; + m = e + 34; + r = b; + + sp_2048_from_mp(b, 34, base); + sp_2048_from_mp(e, 34, exp); + sp_2048_from_mp(m, 34, mod); + + err = sp_2048_mod_exp_34(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + err = sp_2048_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 34U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[34 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 2048) { + err = MP_READ_E; + } + else if (expBits > 2048) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 34 * 2; + m = e + 34; + r = b; + + sp_2048_from_mp(b, 34, base); + sp_2048_from_mp(e, 34, exp); + sp_2048_from_mp(m, 34, mod); + + err = sp_2048_mod_exp_34(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + err = sp_2048_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 34U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_2048 +SP_NOINLINE static void sp_2048_lshift_34(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + r[34] = a[33] >> (61 - n); + for (i=33; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (61 - n))) & 0x1fffffffffffffffL; + } + r[0] = (a[0] << n) & 0x1fffffffffffffffL; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
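The sp_2048_mod_exp_2_34 routine that follows specializes the exponentiation to the FFDHE generator 2: rather than multiplying by a precomputed table entry for each 5-bit window, it left-shifts the running value by the window (sp_2048_lshift_34), then folds the limbs shifted past 2048 bits back in by multiplying them with the precomputed norm value (sp_2048_mul_d_34), adding the product, and conditionally subtracting the modulus. The underlying observation, shown with word-sized values (illustrative function name; assumes 1 < m < 2^63 and a 128-bit intermediate):

    #include <stdint.h>

    /* 2^e mod m: a multiply by the base is just "shift left one bit and
     * conditionally subtract m", which is what makes the base-2 DH path
     * cheaper than the generic table-driven exponentiation. */
    static uint64_t two_pow_mod(uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        int i;

        for (i = 63; i >= 0; i--) {
            r = (uint64_t)(((__uint128_t)r * r) % m);   /* square for every bit */
            if ((e >> i) & 1) {
                r <<= 1;                                /* multiply by 2 */
                if (r >= m)
                    r -= m;
            }
        }
        return r;
    }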
+ */ +static int sp_2048_mod_exp_2_34(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[103]; +#endif + sp_digit* norm = NULL; + sp_digit* tmp = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit o; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 103, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + tmp = td + 68; + XMEMSET(td, 0, sizeof(sp_digit) * 103); + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_34(norm, m); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 60) / 61) - 1; + c = bits % 61; + if (c == 0) { + c = 61; + } + if (i < 34) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (3 - c); + c += 61; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + sp_2048_lshift_34(r, norm, (byte)y); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 3; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 56; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 3; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 61 - c; + } + + sp_2048_mont_sqr_34(r, r, m, mp); + sp_2048_mont_sqr_34(r, r, m, mp); + sp_2048_mont_sqr_34(r, r, m, mp); + sp_2048_mont_sqr_34(r, r, m, mp); + sp_2048_mont_sqr_34(r, r, m, mp); + + sp_2048_lshift_34(r, r, (byte)y); + sp_2048_mul_d_34(tmp, norm, (r[34] << 26) + (r[33] >> 35)); + r[34] = 0; + r[33] &= 0x7ffffffffL; + (void)sp_2048_add_34(r, r, tmp); + sp_2048_norm_34(r); + o = sp_2048_cmp_34(r, m); + sp_2048_cond_sub_34(r, r, m, ((o < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + + sp_2048_mont_reduce_34(r, m, mp); + n = sp_2048_cmp_34(r, m); + sp_2048_cond_sub_34(r, r, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_2048 */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. + * exp Array of bytes that is the exponent. + * expLen Length of data, in bytes, in exponent. + * mod Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 256 bytes long. + * outLen Length, in bytes, of exponentiation result. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
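The while-loop above pulls the exponent apart most-significant window first, buffering one 61-bit limb at a time in n and tracking the number of unconsumed bits in c; the three branches cover a window wholly inside the buffered word, a window starting exactly on a limb boundary, and a window straddling two limbs. The bit count is padded to a multiple of the window size up front (the bits = ((bits + 4) / 5) * 5 line). A simpler bit-at-a-time sketch of the same traversal, for generic limb and window widths (helper name and parameters are illustrative):

    #include <stdint.h>

    /* Return the next 'window'-bit group of e, most-significant first.
     * e is little-endian limbs with 'limb_bits' valid bits per limb;
     * *bitpos starts at total_bits - 1 and walks down to -1.
     * The unrolled n/c/y logic above does this with whole-word reads
     * instead of single bits. */
    static unsigned next_window(const uint64_t *e, int limb_bits, int window,
                                int *bitpos)
    {
        unsigned y = 0;
        int k;

        for (k = 0; k < window && *bitpos >= 0; k++) {
            int word = *bitpos / limb_bits;
            int bit  = *bitpos % limb_bits;
            y = (y << 1) | (unsigned)((e[word] >> bit) & 1);
            (*bitpos)--;
        }
        return y;
    }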
+ */ +int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, + const mp_int* mod, byte* out, word32* outLen) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[34 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + word32 i; + int err = MP_OKAY; + + if (mp_count_bits(base) > 2048) { + err = MP_READ_E; + } + else if (expLen > 256U) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 2048) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 34 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 34 * 2; + m = e + 34; + r = b; + + sp_2048_from_mp(b, 34, base); + sp_2048_from_bin(e, 34, exp, expLen); + sp_2048_from_mp(m, 34, mod); + + #ifdef HAVE_FFDHE_2048 + if (base->used == 1 && base->dp[0] == 2U && + (m[33] >> 3) == 0xffffffffL) { + err = sp_2048_mod_exp_2_34(r, e, expLen * 8U, m); + } + else { + #endif + err = sp_2048_mod_exp_34(r, b, e, expLen * 8U, m, 0); + #ifdef HAVE_FFDHE_2048 + } + #endif + } + + if (err == MP_OKAY) { + sp_2048_to_bin_34(r, out); + *outLen = 256; + for (i=0; i<256U && out[i] == 0U; i++) { + /* Search for first non-zero. */ + } + *outLen -= i; + XMEMMOVE(out, out + i, *outLen); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 34U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +} +#endif /* WOLFSSL_HAVE_SP_DH */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
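sp_DhExp_2048 above writes the full 256-byte big-endian result, then counts leading zero bytes and slides the remainder down so the returned shared secret carries no redundant leading zeros. The same trimming step in isolation (a sketch; buffer and length names are illustrative):

    #include <string.h>

    /* Drop leading zero bytes from a big-endian buffer in place. */
    static void trim_leading_zeros(unsigned char *buf, unsigned int *len)
    {
        unsigned int i;

        for (i = 0; i < *len && buf[i] == 0; i++) {
            /* search for the first non-zero byte */
        }
        memmove(buf, buf + i, *len - i);
        *len -= i;
    }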
+ */ +int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[17 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1024) { + err = MP_READ_E; + } + else if (expBits > 1024) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1024) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 17 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 17 * 2; + m = e + 17; + r = b; + + sp_2048_from_mp(b, 17, base); + sp_2048_from_mp(e, 17, exp); + sp_2048_from_mp(m, 17, mod); + + err = sp_2048_mod_exp_17(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 17, 0, sizeof(*r) * 17U); + err = sp_2048_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 34U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[17 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1024) { + err = MP_READ_E; + } + else if (expBits > 1024) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1024) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 17 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 17 * 2; + m = e + 17; + r = b; + + sp_2048_from_mp(b, 17, base); + sp_2048_from_mp(e, 17, exp); + sp_2048_from_mp(m, 17, mod); + + err = sp_2048_mod_exp_17(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 17, 0, sizeof(*r) * 17U); + err = sp_2048_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 34U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ + +#else /* Read big endian unsigned byte array into r. * * r A single precision integer. @@ -192,7 +3791,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_36(sp_digit* r, byte* a) { int i; int j; @@ -230,6 +3829,48 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 57 bits. + * + * a Array of sp_digit to normalize. 
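The sp_2048_norm_18 and sp_2048_norm_36 bodies that follow are manually unrolled carry-propagation loops: each limb's bits above position 57 are added into the next limb and then masked off, so every limb is back inside its 57-bit range (signed limbs propagate borrows the same way via the arithmetic right shift). The unrolled code is equivalent to this generic loop (type and names here are stand-ins):

    #include <stdint.h>

    typedef int64_t limb;   /* stand-in for sp_digit in this sketch */

    /* Propagate carries/borrows so each limb again holds at most 'bits' bits. */
    static void norm_limbs(limb *a, int words, int bits)
    {
        const limb mask = ((limb)1 << bits) - 1;
        int i;

        for (i = 0; i < words - 1; i++) {
            a[i + 1] += a[i] >> bits;   /* carry (or borrow) into the next limb */
            a[i]     &= mask;
        }
    }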
+ */ +static void sp_2048_norm_18(sp_digit* a) +{ + int i; + for (i = 0; i < 16; i += 8) { + a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; + a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; + a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; + a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; + a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; + a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; + a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; + a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; + } + a[17] += a[16] >> 57; a[16] &= 0x1ffffffffffffffL; +} + +/* Normalize the values in each word to 57 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_2048_norm_36(sp_digit* a) +{ + int i; + for (i = 0; i < 32; i += 8) { + a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; + a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; + a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; + a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; + a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; + a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; + a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; + a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; + } + a[33] += a[32] >> 57; a[32] &= 0x1ffffffffffffffL; + a[34] += a[33] >> 57; a[33] &= 0x1ffffffffffffffL; + a[35] += a[34] >> 57; a[34] &= 0x1ffffffffffffffL; +} + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -240,87 +3881,87 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) SP_NOINLINE static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 0]) * b[ 5] - + ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1] - + ((int128_t)a[ 5]) * b[ 0]; - int128_t t6 = ((int128_t)a[ 0]) * b[ 6] - + ((int128_t)a[ 1]) * b[ 5] - + ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2] - + ((int128_t)a[ 5]) * b[ 1] - + ((int128_t)a[ 6]) * b[ 0]; - int128_t t7 = ((int128_t)a[ 0]) * b[ 7] - + ((int128_t)a[ 1]) * b[ 6] - + ((int128_t)a[ 2]) * b[ 5] - + ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3] - + ((int128_t)a[ 5]) * b[ 2] - + ((int128_t)a[ 6]) * b[ 1] - + ((int128_t)a[ 7]) * b[ 0]; - int128_t t8 = ((int128_t)a[ 0]) * b[ 8] - + ((int128_t)a[ 1]) * b[ 7] - + ((int128_t)a[ 2]) * b[ 6] - + ((int128_t)a[ 3]) * b[ 5] - + ((int128_t)a[ 4]) * b[ 4] - + ((int128_t)a[ 5]) * b[ 3] - + ((int128_t)a[ 6]) * b[ 2] - + ((int128_t)a[ 7]) * b[ 1] - + ((int128_t)a[ 8]) * b[ 0]; - int128_t t9 = ((int128_t)a[ 1]) * b[ 8] - + ((int128_t)a[ 2]) * b[ 7] - + ((int128_t)a[ 3]) * b[ 6] - + ((int128_t)a[ 4]) * b[ 5] - + ((int128_t)a[ 5]) * b[ 4] - + ((int128_t)a[ 6]) * b[ 3] - + ((int128_t)a[ 7]) * b[ 2] - + ((int128_t)a[ 8]) * b[ 1]; - int128_t t10 = ((int128_t)a[ 2]) * b[ 8] - + ((int128_t)a[ 3]) * b[ 7] - + ((int128_t)a[ 4]) * b[ 6] - + ((int128_t)a[ 5]) * b[ 5] - + ((int128_t)a[ 6]) * b[ 4] - + ((int128_t)a[ 7]) * b[ 3] - + 
((int128_t)a[ 8]) * b[ 2]; - int128_t t11 = ((int128_t)a[ 3]) * b[ 8] - + ((int128_t)a[ 4]) * b[ 7] - + ((int128_t)a[ 5]) * b[ 6] - + ((int128_t)a[ 6]) * b[ 5] - + ((int128_t)a[ 7]) * b[ 4] - + ((int128_t)a[ 8]) * b[ 3]; - int128_t t12 = ((int128_t)a[ 4]) * b[ 8] - + ((int128_t)a[ 5]) * b[ 7] - + ((int128_t)a[ 6]) * b[ 6] - + ((int128_t)a[ 7]) * b[ 5] - + ((int128_t)a[ 8]) * b[ 4]; - int128_t t13 = ((int128_t)a[ 5]) * b[ 8] - + ((int128_t)a[ 6]) * b[ 7] - + ((int128_t)a[ 7]) * b[ 6] - + ((int128_t)a[ 8]) * b[ 5]; - int128_t t14 = ((int128_t)a[ 6]) * b[ 8] - + ((int128_t)a[ 7]) * b[ 7] - + ((int128_t)a[ 8]) * b[ 6]; - int128_t t15 = ((int128_t)a[ 7]) * b[ 8] - + ((int128_t)a[ 8]) * b[ 7]; - int128_t t16 = ((int128_t)a[ 8]) * b[ 8]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * b[ 0]; + sp_uint128 t1 = ((sp_uint128)a[ 0]) * b[ 1] + + ((sp_uint128)a[ 1]) * b[ 0]; + sp_uint128 t2 = ((sp_uint128)a[ 0]) * b[ 2] + + ((sp_uint128)a[ 1]) * b[ 1] + + ((sp_uint128)a[ 2]) * b[ 0]; + sp_uint128 t3 = ((sp_uint128)a[ 0]) * b[ 3] + + ((sp_uint128)a[ 1]) * b[ 2] + + ((sp_uint128)a[ 2]) * b[ 1] + + ((sp_uint128)a[ 3]) * b[ 0]; + sp_uint128 t4 = ((sp_uint128)a[ 0]) * b[ 4] + + ((sp_uint128)a[ 1]) * b[ 3] + + ((sp_uint128)a[ 2]) * b[ 2] + + ((sp_uint128)a[ 3]) * b[ 1] + + ((sp_uint128)a[ 4]) * b[ 0]; + sp_uint128 t5 = ((sp_uint128)a[ 0]) * b[ 5] + + ((sp_uint128)a[ 1]) * b[ 4] + + ((sp_uint128)a[ 2]) * b[ 3] + + ((sp_uint128)a[ 3]) * b[ 2] + + ((sp_uint128)a[ 4]) * b[ 1] + + ((sp_uint128)a[ 5]) * b[ 0]; + sp_uint128 t6 = ((sp_uint128)a[ 0]) * b[ 6] + + ((sp_uint128)a[ 1]) * b[ 5] + + ((sp_uint128)a[ 2]) * b[ 4] + + ((sp_uint128)a[ 3]) * b[ 3] + + ((sp_uint128)a[ 4]) * b[ 2] + + ((sp_uint128)a[ 5]) * b[ 1] + + ((sp_uint128)a[ 6]) * b[ 0]; + sp_uint128 t7 = ((sp_uint128)a[ 0]) * b[ 7] + + ((sp_uint128)a[ 1]) * b[ 6] + + ((sp_uint128)a[ 2]) * b[ 5] + + ((sp_uint128)a[ 3]) * b[ 4] + + ((sp_uint128)a[ 4]) * b[ 3] + + ((sp_uint128)a[ 5]) * b[ 2] + + ((sp_uint128)a[ 6]) * b[ 1] + + ((sp_uint128)a[ 7]) * b[ 0]; + sp_uint128 t8 = ((sp_uint128)a[ 0]) * b[ 8] + + ((sp_uint128)a[ 1]) * b[ 7] + + ((sp_uint128)a[ 2]) * b[ 6] + + ((sp_uint128)a[ 3]) * b[ 5] + + ((sp_uint128)a[ 4]) * b[ 4] + + ((sp_uint128)a[ 5]) * b[ 3] + + ((sp_uint128)a[ 6]) * b[ 2] + + ((sp_uint128)a[ 7]) * b[ 1] + + ((sp_uint128)a[ 8]) * b[ 0]; + sp_uint128 t9 = ((sp_uint128)a[ 1]) * b[ 8] + + ((sp_uint128)a[ 2]) * b[ 7] + + ((sp_uint128)a[ 3]) * b[ 6] + + ((sp_uint128)a[ 4]) * b[ 5] + + ((sp_uint128)a[ 5]) * b[ 4] + + ((sp_uint128)a[ 6]) * b[ 3] + + ((sp_uint128)a[ 7]) * b[ 2] + + ((sp_uint128)a[ 8]) * b[ 1]; + sp_uint128 t10 = ((sp_uint128)a[ 2]) * b[ 8] + + ((sp_uint128)a[ 3]) * b[ 7] + + ((sp_uint128)a[ 4]) * b[ 6] + + ((sp_uint128)a[ 5]) * b[ 5] + + ((sp_uint128)a[ 6]) * b[ 4] + + ((sp_uint128)a[ 7]) * b[ 3] + + ((sp_uint128)a[ 8]) * b[ 2]; + sp_uint128 t11 = ((sp_uint128)a[ 3]) * b[ 8] + + ((sp_uint128)a[ 4]) * b[ 7] + + ((sp_uint128)a[ 5]) * b[ 6] + + ((sp_uint128)a[ 6]) * b[ 5] + + ((sp_uint128)a[ 7]) * b[ 4] + + ((sp_uint128)a[ 8]) * b[ 3]; + sp_uint128 t12 = ((sp_uint128)a[ 4]) * b[ 8] + + ((sp_uint128)a[ 5]) * b[ 7] + + ((sp_uint128)a[ 6]) * b[ 6] + + ((sp_uint128)a[ 7]) * b[ 5] + + ((sp_uint128)a[ 8]) * b[ 4]; + sp_uint128 t13 = ((sp_uint128)a[ 5]) * b[ 8] + + ((sp_uint128)a[ 6]) * b[ 7] + + ((sp_uint128)a[ 7]) * b[ 6] + + ((sp_uint128)a[ 8]) * b[ 5]; + sp_uint128 t14 = ((sp_uint128)a[ 6]) * b[ 8] + + ((sp_uint128)a[ 7]) * b[ 7] + + ((sp_uint128)a[ 8]) * b[ 6]; + sp_uint128 t15 = ((sp_uint128)a[ 7]) * b[ 8] + + ((sp_uint128)a[ 8]) * b[ 7]; + 
sp_uint128 t16 = ((sp_uint128)a[ 8]) * b[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -349,51 +3990,51 @@ SP_NOINLINE static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 0]) * a[ 5] - + ((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 0]) * a[ 6] - + ((int128_t)a[ 1]) * a[ 5] - + ((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 0]) * a[ 7] - + ((int128_t)a[ 1]) * a[ 6] - + ((int128_t)a[ 2]) * a[ 5] - + ((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = (((int128_t)a[ 0]) * a[ 8] - + ((int128_t)a[ 1]) * a[ 7] - + ((int128_t)a[ 2]) * a[ 6] - + ((int128_t)a[ 3]) * a[ 5]) * 2 - + ((int128_t)a[ 4]) * a[ 4]; - int128_t t9 = (((int128_t)a[ 1]) * a[ 8] - + ((int128_t)a[ 2]) * a[ 7] - + ((int128_t)a[ 3]) * a[ 6] - + ((int128_t)a[ 4]) * a[ 5]) * 2; - int128_t t10 = (((int128_t)a[ 2]) * a[ 8] - + ((int128_t)a[ 3]) * a[ 7] - + ((int128_t)a[ 4]) * a[ 6]) * 2 - + ((int128_t)a[ 5]) * a[ 5]; - int128_t t11 = (((int128_t)a[ 3]) * a[ 8] - + ((int128_t)a[ 4]) * a[ 7] - + ((int128_t)a[ 5]) * a[ 6]) * 2; - int128_t t12 = (((int128_t)a[ 4]) * a[ 8] - + ((int128_t)a[ 5]) * a[ 7]) * 2 - + ((int128_t)a[ 6]) * a[ 6]; - int128_t t13 = (((int128_t)a[ 5]) * a[ 8] - + ((int128_t)a[ 6]) * a[ 7]) * 2; - int128_t t14 = (((int128_t)a[ 6]) * a[ 8]) * 2 - + ((int128_t)a[ 7]) * a[ 7]; - int128_t t15 = (((int128_t)a[ 7]) * a[ 8]) * 2; - int128_t t16 = ((int128_t)a[ 8]) * a[ 8]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = (((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] + + 
((sp_uint128)a[ 6]) * a[ 7]) * 2; + sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -638,239 +4279,6 @@ SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) } #endif /* !WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 36; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 36; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[35]) * b[35]; - r[71] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 69; k >= 0; k--) { - for (i = 35; i >= 0; i--) { - j = k - i; - if (j >= 36) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[35]) * a[35]; - r[71] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 69; k >= 0; k--) { - for (i = 35; i >= 0; i--) { - j = k - i; - if (j >= 36 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); -} - -#endif /* WOLFSSL_SP_SMALL */ -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_2048_add_18(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 18; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
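The small-code multiply and square routines removed in this hunk, like the unrolled 9-word routines above, are product-scanning loops: output column k accumulates every partial product a[i]*b[k-i] in a 128-bit accumulator before a single carry is pushed into the next column, and squaring halves the work by counting each off-diagonal product twice. A generic, non-unrolled version of the multiply (the 57-bit limb size mirrors the surrounding code; a compiler with a 128-bit type is assumed, as the file already requires):

    #include <stdint.h>

    typedef int64_t limb;                 /* stand-in for sp_digit   */
    typedef unsigned __int128 dlimb;      /* stand-in for sp_uint128 */

    #define LIMB_BITS 57
    #define LIMB_MASK (((limb)1 << LIMB_BITS) - 1)

    /* Product-scanning multiply: r[0..2n-1] = a[0..n-1] * b[0..n-1]. */
    static void mul_limbs(limb *r, const limb *a, const limb *b, int n)
    {
        dlimb acc = 0;
        int i, k;

        for (k = 0; k < 2 * n - 1; k++) {
            int lo = (k < n) ? 0 : (k - n + 1);
            int hi = (k < n) ? k : (n - 1);
            for (i = lo; i <= hi; i++) {
                acc += (dlimb)a[i] * (dlimb)b[k - i];
            }
            r[k] = (limb)(acc & LIMB_MASK);   /* emit one 57-bit column   */
            acc >>= LIMB_BITS;                /* carry to the next column */
        }
        r[2 * n - 1] = (limb)acc;
    }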
- */ -SP_NOINLINE static int sp_2048_sub_18(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 18; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[17]) * b[17]; - r[35] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 33; k >= 0; k--) { - for (i = 17; i >= 0; i--) { - j = k - i; - if (j >= 18) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[17]) * a[17]; - r[35] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 33; k >= 0; k--) { - for (i = 17; i >= 0; i--) { - j = k - i; - if (j >= 18 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); -} - -#endif /* WOLFSSL_SP_SMALL */ -#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ - /* Caclulate the bottom digit of -1/a mod 2^n. * * a A single precision number. @@ -902,22 +4310,10 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 36; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1ffffffffffffffL); - t >>= 57; - } - r[36] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 36; i += 4) { @@ -943,7 +4339,6 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, r[i + 3] = (sp_digit)t2; } r[36] = (sp_digit)(t & 0x1ffffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -956,13 +4351,6 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, static void sp_2048_mont_norm_18(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. 
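sp_2048_mont_setup above derives rho = -1/m mod 2^57 from the lowest limb of the modulus. The standard way to invert an odd number modulo a power of two is Hensel lifting, where x *= 2 - m*x doubles the number of correct low bits at every step; the sketch below shows that technique at word size (the general idea, not a line-for-line copy of the generated setup code):

    #include <stdint.h>

    /* rho = -1/m mod 2^57 for odd m, via Hensel lifting. */
    static uint64_t mont_rho_57(uint64_t m)
    {
        const uint64_t mask = ((uint64_t)1 << 57) - 1;
        uint64_t x = m;         /* m*m == 1 (mod 8), so x is an inverse mod 2^3 */

        x *= 2 - m * x;         /* inverse mod 2^6  */
        x *= 2 - m * x;         /* inverse mod 2^12 */
        x *= 2 - m * x;         /* inverse mod 2^24 */
        x *= 2 - m * x;         /* inverse mod 2^48 */
        x *= 2 - m * x;         /* inverse mod 2^96, i.e. plenty for 57 bits */
        return (0 - x) & mask;  /* negate to get -1/m mod 2^57 */
    }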
*/ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<17; i++) { - r[i] = 0x1ffffffffffffffL; - } -#else int i; for (i = 0; i < 16; i += 8) { @@ -976,7 +4364,6 @@ static void sp_2048_mont_norm_18(sp_digit* r, const sp_digit* m) r[i + 7] = 0x1ffffffffffffffL; } r[16] = 0x1ffffffffffffffL; -#endif r[17] = 0x7fffffffffffffL; /* r = (2^n - 1) mod n */ @@ -996,13 +4383,6 @@ static void sp_2048_mont_norm_18(sp_digit* r, const sp_digit* m) static sp_digit sp_2048_cmp_18(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=17; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[17] - b[17]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -1017,7 +4397,6 @@ static sp_digit sp_2048_cmp_18(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -1033,13 +4412,6 @@ static sp_digit sp_2048_cmp_18(const sp_digit* a, const sp_digit* b) static void sp_2048_cond_sub_18(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 18; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 16; i += 8) { @@ -1054,7 +4426,6 @@ static void sp_2048_cond_sub_18(sp_digit* r, const sp_digit* a, } r[16] = a[16] - (b[16] & m); r[17] = a[17] - (b[17] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -1066,20 +4437,8 @@ static void sp_2048_cond_sub_18(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_2048_mul_add_18(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 18; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffffffffffffL; - t >>= 57; - } - r[18] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffffffffffffL); @@ -1104,35 +4463,6 @@ SP_NOINLINE static void sp_2048_mul_add_18(sp_digit* r, const sp_digit* a, t[1] = tb * a[17]; r[17] += (sp_digit)((t[0] >> 57) + (t[1] & 0x1ffffffffffffffL)); r[18] += (sp_digit)(t[1] >> 57); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 57. - * - * a Array of sp_digit to normalize. - */ -static void sp_2048_norm_18(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 17; i++) { - a[i+1] += a[i] >> 57; - a[i] &= 0x1ffffffffffffffL; - } -#else - int i; - for (i = 0; i < 16; i += 8) { - a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; - a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; - a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; - a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; - a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; - a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; - a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; - a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; - } - a[16+1] += a[16] >> 57; a[16] &= 0x1ffffffffffffffL; -#endif } /* Shift the result in the high 1024 bits down to the bottom. 
@@ -1142,37 +4472,23 @@ static void sp_2048_norm_18(sp_digit* a) */ static void sp_2048_mont_shift_18(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL - int i; - word64 n; - - n = a[17] >> 55; - for (i = 0; i < 17; i++) { - n += (word64)a[18 + i] << 2; - r[i] = n & 0x1ffffffffffffffL; - n >>= 57; - } - n += (word64)a[35] << 2; - r[17] = n; -#else - word64 n; + sp_uint64 n; int i; - n = (word64)a[17]; + n = (sp_uint64)a[17]; n = n >> 55U; for (i = 0; i < 16; i += 8) { - n += (word64)a[i+18] << 2U; r[i+0] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+19] << 2U; r[i+1] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+20] << 2U; r[i+2] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+21] << 2U; r[i+3] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+22] << 2U; r[i+4] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+23] << 2U; r[i+5] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+24] << 2U; r[i+6] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+25] << 2U; r[i+7] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+18] << 2U; r[i+0] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+19] << 2U; r[i+1] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+20] << 2U; r[i+2] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+21] << 2U; r[i+3] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+22] << 2U; r[i+4] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+23] << 2U; r[i+5] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+24] << 2U; r[i+6] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+25] << 2U; r[i+7] = n & 0x1ffffffffffffffUL; n >>= 57U; } - n += (word64)a[34] << 2U; r[16] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[35] << 2U; r[17] = n; -#endif /* WOLFSSL_SP_SMALL */ + n += (sp_uint64)a[34] << 2U; r[16] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[35] << 2U; r[17] = n; XMEMSET(&r[18], 0, sizeof(*r) * 18U); } @@ -1199,7 +4515,7 @@ static void sp_2048_mont_reduce_18(sp_digit* a, const sp_digit* m, sp_digit mp) a[i+1] += a[i] >> 57; a[i] &= 0x1ffffffffffffffL; sp_2048_mont_shift_18(a, a); - sp_2048_cond_sub_18(a, a, m, 0 - (((a[17] >> 55) > 0) ? + sp_2048_cond_sub_18(a, a, m, 0 - (((a[17] - m[17]) > 0) ? (sp_digit)1 : (sp_digit)0)); sp_2048_norm_18(a); } @@ -1243,22 +4559,10 @@ static void sp_2048_mont_sqr_18(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 18; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1ffffffffffffffL); - t >>= 57; - } - r[18] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 16; i += 4) { @@ -1290,7 +4594,6 @@ SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, r[17] = (sp_digit)(t & 0x1ffffffffffffffL); t >>= 57; r[18] = (sp_digit)(t & 0x1ffffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ } /* Conditionally add a and b using the mask m. 
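The cond_sub/cond_add helpers and the recurring ((c < 0) ? (sp_digit)1 : (sp_digit)0) - 1 expressions all build the same branch-free mask: subtracting 1 turns a 0/1 comparison result into an all-ones or all-zero word, so the final reduction step (subtract m only when the compare says r >= m) runs without a secret-dependent branch. The idiom for a single word, as a sketch:

    #include <stdint.h>

    /* r = a - (b & mask): mask is all-ones when 'cond' is non-zero, else 0,
     * so the subtraction happens without a data-dependent branch - the same
     * pattern as sp_2048_cond_sub_18/36. */
    static uint64_t cond_sub_word(uint64_t a, uint64_t b, int cond)
    {
        uint64_t mask = (uint64_t)0 - (uint64_t)(cond != 0);
        return a - (b & mask);
    }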
@@ -1304,13 +4607,6 @@ SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, static void sp_2048_cond_add_18(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 18; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 16; i += 8) { @@ -1325,7 +4621,25 @@ static void sp_2048_cond_add_18(sp_digit* r, const sp_digit* a, } r[16] = a[16] + (b[16] & m); r[17] = a[17] + (b[17] & m); -#endif /* WOLFSSL_SP_SMALL */ +} + +SP_NOINLINE static void sp_2048_rshift_18(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<16; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (57 - n)) & 0x1ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (57 - n)) & 0x1ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (57 - n)) & 0x1ffffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (57 - n)) & 0x1ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (57 - n)) & 0x1ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (57 - n)) & 0x1ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (57 - n)) & 0x1ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (57 - n)) & 0x1ffffffffffffffL); + } + r[16] = (a[16] >> n) | ((a[17] << (57 - n)) & 0x1ffffffffffffffL); + r[17] = a[17] >> n; } #ifdef WOLFSSL_SP_DIV_64 @@ -1420,7 +4734,7 @@ static WC_INLINE sp_digit sp_2048_div_word_18(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -1428,40 +4742,45 @@ static WC_INLINE sp_digit sp_2048_div_word_18(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
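The rewritten division below first scales both divisor and dividend by the same power of two, 1 << 2 for the 18-limb code (the top limb of a 1024-bit value holds 55 bits) and 1 << 4 for the 36-limb code (53 bits), so the divisor's top limb uses all 57 bits. With a normalized top limb, the per-step quotient estimate taken from the top two limbs is close enough that the following correction step converges, and the remainder is simply shifted back down at the end (sp_2048_rshift_18/36). The effect of normalization on an ordinary division, as a sketch:

    #include <stdint.h>

    /* Knuth-style normalization: scaling dividend and divisor by the same
     * power of two leaves the quotient unchanged and scales the remainder.
     * 's' must be small enough that neither shift overflows. */
    static void div_normalized(uint64_t a, uint64_t d, unsigned s,
                               uint64_t *q, uint64_t *r)
    {
        uint64_t an = a << s;
        uint64_t dn = d << s;

        *q = an / dn;            /* same quotient as a / d  */
        *r = (an % dn) >> s;     /* same remainder as a % d */
    }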
*/ -static int sp_2048_div_18(const sp_digit* a, const sp_digit* d, +static int sp_2048_div_18(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 18 + 1]; + sp_digit t1[4 * 18 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 18 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 18 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 18; + (void)m; - dv = d[17]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 18U); - for (i=17; i>=0; i--) { - t1[18 + i] += t1[18 + i - 1] >> 57; - t1[18 + i - 1] &= 0x1ffffffffffffffL; + if (err == MP_OKAY) { + t2 = t1 + 36 + 1; + sd = t2 + 18 + 1; + + sp_2048_mul_d_18(sd, d, (sp_digit)1 << 2); + sp_2048_mul_d_36(t1, a, (sp_digit)1 << 2); + dv = sd[17]; + t1[18 + 18] += t1[18 + 18 - 1] >> 57; + t1[18 + 18 - 1] &= 0x1ffffffffffffffL; + for (i=18; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_64 d1 = t1[18 + i]; d1 <<= 57; @@ -1471,15 +4790,22 @@ static int sp_2048_div_18(const sp_digit* a, const sp_digit* d, r1 = sp_2048_div_word_18(t1[18 + i], t1[18 + i - 1], dv); #endif - sp_2048_mul_d_18(t2, d, r1); + sp_2048_mul_d_18(t2, sd, r1); (void)sp_2048_sub_18(&t1[i], &t1[i], t2); sp_2048_norm_18(&t1[i]); t1[18 + i] -= t2[18]; t1[18 + i] += t1[18 + i - 1] >> 57; t1[18 + i - 1] &= 0x1ffffffffffffffL; - r1 = (((-t1[18 + i]) << 57) - t1[18 + i - 1]) / dv; - r1++; - sp_2048_mul_d_18(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[18 + i]; + d1 <<= 57; + d1 -= t1[18 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_18(-t1[18 + i], -t1[18 + i - 1], dv); +#endif + r1 -= t1[18 + i]; + sp_2048_mul_d_18(t2, sd, r1); (void)sp_2048_add_18(&t1[i], &t1[i], t2); t1[18 + i] += t1[18 + i - 1] >> 57; t1[18 + i - 1] &= 0x1ffffffffffffffL; @@ -1488,15 +4814,18 @@ static int sp_2048_div_18(const sp_digit* a, const sp_digit* d, t1[18 - 2] &= 0x1ffffffffffffffL; r1 = t1[18 - 1] / dv; - sp_2048_mul_d_18(t2, d, r1); - (void)sp_2048_sub_18(t1, t1, t2); + sp_2048_mul_d_18(t2, sd, r1); + sp_2048_sub_18(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 36U); for (i=0; i<17; i++) { r[i+1] += r[i] >> 57; r[i] &= 0x1ffffffffffffffL; } - sp_2048_cond_add_18(r, r, d, 0 - ((r[17] < 0) ? + sp_2048_cond_add_18(r, r, sd, 0 - ((r[17] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_2048_norm_18(r); + sp_2048_rshift_18(r, r, 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -1801,13 +5130,25 @@ static int sp_2048_mod_exp_18(sp_digit* r, const sp_digit* a, const sp_digit* e, c -= 5; XMEMCPY(rt, t[y], sizeof(sp_digit) * 36); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 52; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 7; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_2048_mont_sqr_18(rt, rt, m, mp); sp_2048_mont_sqr_18(rt, rt, m, mp); @@ -1845,13 +5186,6 @@ static int sp_2048_mod_exp_18(sp_digit* r, const sp_digit* a, const sp_digit* e, static void sp_2048_mont_norm_36(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<35; i++) { - r[i] = 0x1ffffffffffffffL; - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -1867,7 +5201,6 @@ static void sp_2048_mont_norm_36(sp_digit* r, const sp_digit* m) r[32] = 0x1ffffffffffffffL; r[33] = 0x1ffffffffffffffL; r[34] = 0x1ffffffffffffffL; -#endif r[35] = 0x1fffffffffffffL; /* r = (2^n - 1) mod n */ @@ -1887,13 +5220,6 @@ static void sp_2048_mont_norm_36(sp_digit* r, const sp_digit* m) static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=35; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[35] - b[35]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -1910,7 +5236,6 @@ static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -1926,13 +5251,6 @@ static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b) static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 36; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -1949,7 +5267,6 @@ static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a, r[33] = a[33] - (b[33] & m); r[34] = a[34] - (b[34] & m); r[35] = a[35] - (b[35] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -1961,20 +5278,8 @@ static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 36; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffffffffffffL; - t >>= 57; - } - r[36] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffffffffffffL); @@ -2003,37 +5308,6 @@ SP_NOINLINE static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a, t[3] = tb * a[35]; r[35] += (sp_digit)((t[2] >> 57) + (t[3] & 0x1ffffffffffffffL)); r[36] += (sp_digit)(t[3] >> 57); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 57. - * - * a Array of sp_digit to normalize. 
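sp_2048_mod_exp_18 above, and sp_2048_mod_exp_36 further down, use fixed-window (2^k-ary) exponentiation: t[j] holds base^j in Montgomery form for j = 0 .. 2^k - 1, and each k-bit slice of the exponent costs k Montgomery squarings plus one table multiply. The 36-limb version's table shrinks in this patch from 32 entries (5-bit windows) to 16 entries (4-bit windows), halving the precomputed table. The control flow with word-sized arithmetic and k = 4 (illustrative; a 128-bit intermediate type is assumed):

    #include <stdint.h>

    /* Fixed-window exponentiation, k = 4: precompute b^0..b^15 mod m, then
     * consume the exponent four bits at a time, most-significant first. */
    static uint64_t modexp_window4(uint64_t b, uint64_t e, uint64_t m)
    {
        uint64_t t[16];
        uint64_t r;
        int i, w;

        t[0] = 1 % m;
        for (i = 1; i < 16; i++)
            t[i] = (uint64_t)(((__uint128_t)t[i - 1] * b) % m);

        r = t[(e >> 60) & 0xf];                             /* top window  */
        for (w = 56; w >= 0; w -= 4) {
            for (i = 0; i < 4; i++)
                r = (uint64_t)(((__uint128_t)r * r) % m);   /* k squarings */
            r = (uint64_t)(((__uint128_t)r * t[(e >> w) & 0xf]) % m);
        }
        return r;
    }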
- */ -static void sp_2048_norm_36(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 35; i++) { - a[i+1] += a[i] >> 57; - a[i] &= 0x1ffffffffffffffL; - } -#else - int i; - for (i = 0; i < 32; i += 8) { - a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; - a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; - a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; - a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; - a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; - a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; - a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; - a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; - } - a[32+1] += a[32] >> 57; a[32] &= 0x1ffffffffffffffL; - a[33+1] += a[33] >> 57; a[33] &= 0x1ffffffffffffffL; - a[34+1] += a[34] >> 57; a[34] &= 0x1ffffffffffffffL; -#endif } /* Shift the result in the high 2048 bits down to the bottom. @@ -2043,22 +5317,6 @@ static void sp_2048_norm_36(sp_digit* a) */ static void sp_2048_mont_shift_36(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL - int i; - sp_digit n; - sp_digit s; - - s = a[36]; - n = a[35] >> 53; - for (i = 0; i < 35; i++) { - n += (s & 0x1ffffffffffffffL) << 4; - r[i] = n & 0x1ffffffffffffffL; - n >>= 57; - s = a[37 + i] + (s >> 57); - } - n += s << 4; - r[35] = n; -#else sp_digit n; sp_digit s; int i; @@ -2089,7 +5347,6 @@ static void sp_2048_mont_shift_36(sp_digit* r, const sp_digit* a) n += (s & 0x1ffffffffffffffL) << 4; r[34] = n & 0x1ffffffffffffffL; n >>= 57; s = a[71] + (s >> 57); n += s << 4; r[35] = n; -#endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[36], 0, sizeof(*r) * 36U); } @@ -2141,7 +5398,7 @@ static void sp_2048_mont_reduce_36(sp_digit* a, const sp_digit* m, sp_digit mp) a[i] &= 0x1ffffffffffffffL; #endif sp_2048_mont_shift_36(a, a); - sp_2048_cond_sub_36(a, a, m, 0 - (((a[35] >> 53) > 0) ? + sp_2048_cond_sub_36(a, a, m, 0 - (((a[35] - m[35]) > 0) ? (sp_digit)1 : (sp_digit)0)); sp_2048_norm_36(a); } @@ -2176,6 +5433,46 @@ static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_36(r, m, mp); } +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_2048_mul_d_72(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + sp_digit t2; + sp_int128 p[4]; + int i; + + for (i = 0; i < 72; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = (sp_digit)t2; + } + r[72] = (sp_digit)(t & 0x1ffffffffffffffL); +} + /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
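sp_2048_mont_reduce_36 above (now with its conditional-subtract test against m[35]) clears the low half of the double-width value one limb at a time: mu = a[i] * rho mod 2^57 is chosen so that adding mu * m at position i zeroes limb i; once the low limbs are cleared, sp_2048_mont_shift_36 slides the surviving high half down and a single conditional subtract brings the result below m. The same structure at single-word granularity is the classic REDC step (sketch; assumes m < 2^63 so the intermediate sum fits in 128 bits):

    #include <stdint.h>

    /* One-word Montgomery reduction: for odd m, rho = -1/m mod 2^64 and
     * t < m * 2^64, returns t * 2^-64 mod m.  mu is chosen so the low word
     * of t + mu*m is zero, mirroring what mul_add + mont_shift do per limb. */
    static uint64_t redc_u64(unsigned __int128 t, uint64_t m, uint64_t rho)
    {
        uint64_t mu = (uint64_t)t * rho;                   /* mod 2^64 */
        unsigned __int128 u = t + (unsigned __int128)mu * m;
        uint64_t r = (uint64_t)(u >> 64);                  /* low word is zero */

        return (r >= m) ? r - m : r;                       /* one cond. subtract */
    }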
* @@ -2187,13 +5484,6 @@ static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a, static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 36; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -2210,7 +5500,27 @@ static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a, r[33] = a[33] + (b[33] & m); r[34] = a[34] + (b[34] & m); r[35] = a[35] + (b[35] & m); -#endif /* WOLFSSL_SP_SMALL */ +} + +SP_NOINLINE static void sp_2048_rshift_36(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<32; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (57 - n)) & 0x1ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (57 - n)) & 0x1ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (57 - n)) & 0x1ffffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (57 - n)) & 0x1ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (57 - n)) & 0x1ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (57 - n)) & 0x1ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (57 - n)) & 0x1ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (57 - n)) & 0x1ffffffffffffffL); + } + r[32] = (a[32] >> n) | ((a[33] << (57 - n)) & 0x1ffffffffffffffL); + r[33] = (a[33] >> n) | ((a[34] << (57 - n)) & 0x1ffffffffffffffL); + r[34] = (a[34] >> n) | ((a[35] << (57 - n)) & 0x1ffffffffffffffL); + r[35] = a[35] >> n; } #ifdef WOLFSSL_SP_DIV_64 @@ -2305,7 +5615,7 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -2313,40 +5623,45 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
*/ -static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, +static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 36 + 1]; + sp_digit t1[4 * 36 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 36 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 36 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 36; + (void)m; - dv = d[35]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 36U); - for (i=35; i>=0; i--) { - t1[36 + i] += t1[36 + i - 1] >> 57; - t1[36 + i - 1] &= 0x1ffffffffffffffL; + if (err == MP_OKAY) { + t2 = t1 + 72 + 1; + sd = t2 + 36 + 1; + + sp_2048_mul_d_36(sd, d, (sp_digit)1 << 4); + sp_2048_mul_d_72(t1, a, (sp_digit)1 << 4); + dv = sd[35]; + t1[36 + 36] += t1[36 + 36 - 1] >> 57; + t1[36 + 36 - 1] &= 0x1ffffffffffffffL; + for (i=36; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_64 d1 = t1[36 + i]; d1 <<= 57; @@ -2356,15 +5671,22 @@ static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, r1 = sp_2048_div_word_36(t1[36 + i], t1[36 + i - 1], dv); #endif - sp_2048_mul_d_36(t2, d, r1); + sp_2048_mul_d_36(t2, sd, r1); (void)sp_2048_sub_36(&t1[i], &t1[i], t2); sp_2048_norm_36(&t1[i]); t1[36 + i] -= t2[36]; t1[36 + i] += t1[36 + i - 1] >> 57; t1[36 + i - 1] &= 0x1ffffffffffffffL; - r1 = (((-t1[36 + i]) << 57) - t1[36 + i - 1]) / dv; - r1++; - sp_2048_mul_d_36(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[36 + i]; + d1 <<= 57; + d1 -= t1[36 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_2048_div_word_36(-t1[36 + i], -t1[36 + i - 1], dv); +#endif + r1 -= t1[36 + i]; + sp_2048_mul_d_36(t2, sd, r1); (void)sp_2048_add_36(&t1[i], &t1[i], t2); t1[36 + i] += t1[36 + i - 1] >> 57; t1[36 + i - 1] &= 0x1ffffffffffffffL; @@ -2373,15 +5695,18 @@ static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, t1[36 - 2] &= 0x1ffffffffffffffL; r1 = t1[36 - 1] / dv; - sp_2048_mul_d_36(t2, d, r1); - (void)sp_2048_sub_36(t1, t1, t2); + sp_2048_mul_d_36(t2, sd, r1); + sp_2048_sub_36(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 72U); for (i=0; i<35; i++) { r[i+1] += r[i] >> 57; r[i] &= 0x1ffffffffffffffL; } - sp_2048_cond_add_36(r, r, d, 0 - ((r[35] < 0) ? + sp_2048_cond_add_36(r, r, sd, 0 - ((r[35] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_2048_norm_36(r); + sp_2048_rshift_36(r, r, 4); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -2593,9 +5918,9 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 72) + 72]; + sp_digit td[(16 * 72) + 72]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm = NULL; sp_digit mp = 1; @@ -2606,7 +5931,7 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 72) + 72), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 72) + 72), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -2614,9 +5939,9 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 72; - rt = td + 2304; + rt = td + 1152; sp_2048_mont_setup(m, &mp); sp_2048_mont_norm_36(norm, m); @@ -2649,24 +5974,8 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_mul_36(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_36(t[14], t[ 7], m, mp); sp_2048_mont_mul_36(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_36(t[16], t[ 8], m, mp); - sp_2048_mont_mul_36(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_36(t[18], t[ 9], m, mp); - sp_2048_mont_mul_36(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_36(t[20], t[10], m, mp); - sp_2048_mont_mul_36(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_36(t[22], t[11], m, mp); - sp_2048_mont_mul_36(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_36(t[24], t[12], m, mp); - sp_2048_mont_mul_36(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_36(t[26], t[13], m, mp); - sp_2048_mont_mul_36(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_36(t[28], t[14], m, mp); - sp_2048_mont_mul_36(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_36(t[30], t[15], m, mp); - sp_2048_mont_mul_36(t[31], t[16], t[15], m, mp); - bits = ((bits + 4) / 5) * 5; + bits = ((bits + 3) / 4) * 4; i = ((bits + 56) / 57) - 1; c = bits % 57; if (c == 0) { @@ -2679,28 +5988,39 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, n = 0; i--; } - if (c < 5) { + if (c < 4) { n |= e[i--] << (7 - c); c += 57; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; XMEMCPY(rt, t[y], sizeof(sp_digit) * 72); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 53; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 7; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_2048_mont_sqr_36(rt, rt, m, mp); sp_2048_mont_sqr_36(rt, rt, m, mp); sp_2048_mont_sqr_36(rt, rt, m, mp); sp_2048_mont_sqr_36(rt, rt, m, mp); - sp_2048_mont_sqr_36(rt, rt, m, mp); sp_2048_mont_mul_36(rt, rt, t[y], m, mp); } @@ -2830,7 +6150,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_2048_cond_sub_36(r, r, m, ((mp < 0) ? 
(sp_digit)1 : (sp_digit)0)- 1); - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -2943,7 +6263,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -3037,7 +6357,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -3110,7 +6430,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -3133,10 +6453,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[18 * 11]; + sp_digit a[18 * 8]; #endif sp_digit* p = NULL; - sp_digit* q = NULL; sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -3165,31 +6484,31 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 36 * 2; - q = p + 18; - qi = dq = dp = q + 18; + p = a + 36; + qi = dq = dp = p + 18; tmpa = qi + 18; tmpb = tmpa + 36; - r = a + 36; + r = a; sp_2048_from_bin(a, 36, in, inLen); sp_2048_from_mp(p, 18, pm); - sp_2048_from_mp(q, 18, qm); sp_2048_from_mp(dp, 18, dpm); err = sp_2048_mod_exp_18(tmpa, a, dp, 1024, p, 1); } if (err == MP_OKAY) { + sp_2048_from_mp(p, 18, qm); sp_2048_from_mp(dq, 18, dqm); - err = sp_2048_mod_exp_18(tmpb, a, dq, 1024, q, 1); + err = sp_2048_mod_exp_18(tmpb, a, dq, 1024, p, 1); } if (err == MP_OKAY) { + sp_2048_from_mp(p, 18, pm); (void)sp_2048_sub_18(tmpa, tmpa, tmpb); sp_2048_norm_18(tmpa); sp_2048_cond_add_18(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[17] >> 63)); @@ -3201,11 +6520,12 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_mul_18(tmpa, q, tmpa); + sp_2048_from_mp(p, 18, qm); + sp_2048_mul_18(tmpa, p, tmpa); (void)sp_2048_add_36(r, tmpb, tmpa); sp_2048_norm_36(r); - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -3213,7 +6533,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 18 * 11); + ForceZero(a, sizeof(sp_digit) * 18 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -3256,7 +6576,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -3300,7 +6620,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, (void)sp_2048_add_36(r, tmpb, tmpa); sp_2048_norm_36(r); - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; } @@ -3537,14 +6857,6 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, SP_NOINLINE static void sp_2048_lshift_36(sp_digit* r, const sp_digit* a, byte n) { -#ifdef WOLFSSL_SP_SMALL - int i; - - 
r[36] = a[35] >> (57 - n); - for (i=35; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (57 - n))) & 0x1ffffffffffffffL; - } -#else sp_int_digit s; sp_int_digit t; @@ -3620,7 +6932,6 @@ SP_NOINLINE static void sp_2048_lshift_36(sp_digit* r, const sp_digit* a, r[2] = ((s << n) | (t >> (57U - n))) & 0x1ffffffffffffffUL; s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); r[1] = ((s << n) | (t >> (57U - n))) & 0x1ffffffffffffffUL; -#endif r[0] = (a[0] << n) & 0x1ffffffffffffffL; } @@ -3686,13 +6997,25 @@ static int sp_2048_mod_exp_2_36(sp_digit* r, const sp_digit* e, int bits, const c -= 5; sp_2048_lshift_36(r, norm, (byte)y); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 52; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 7; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_2048_mont_sqr_36(r, r, m, mp); sp_2048_mont_sqr_36(r, r, m, mp); @@ -3742,80 +7065,6 @@ static int sp_2048_mod_exp_2_36(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[36 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 2048) { - err = MP_READ_E; - } - else if (expLen > 256) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 2048) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 36 * 2; - m = e + 36; - r = b; - - sp_2048_from_mp(b, 36, base); - sp_2048_from_bin(e, 36, exp, expLen); - sp_2048_from_mp(m, 36, mod); - - #ifdef HAVE_FFDHE_2048 - if (base->used == 1 && base->dp[0] == 2 && - (m[35] >> 21) == 0xffffffffL) { - err = sp_2048_mod_exp_2_36(r, e, expLen * 8, m); - } - else - #endif - err = sp_2048_mod_exp_36(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_2048_to_bin(r, out); - *outLen = 256; - for (i=0; i<256 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 36U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else @@ -3872,7 +7121,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_36(r, out); *outLen = 256; for (i=0; i<256U && out[i] == 0U; i++) { /* Search for first non-zero. 
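For the DH case where the base is 2, sp_2048_mod_exp_2_36 above avoids a table of powers altogether: multiplying the accumulator by 2^y is just a left shift across the 57-bit limbs (sp_2048_lshift_36) followed by a Montgomery reduction, so each 5-bit window costs a run of squarings plus one shift-and-reduce. The sketch below shows only the cross-limb shift, with illustrative names; it uses unsigned limbs so the shift is well defined for any 0 < s < 57.

#include <stdint.h>

#define LIMB_BITS 57
#define LIMB_MASK 0x1ffffffffffffffULL

/* Shift an n-limb value left by s bits (0 < s < 57); the extra top limb
 * r[n] is what the modular reduction folds back in afterwards. */
static void lshift_limbs(uint64_t* r, const uint64_t* a, int n, int s)
{
    int i;

    r[n] = a[n - 1] >> (LIMB_BITS - s);
    for (i = n - 1; i > 0; i--) {
        r[i] = ((a[i] << s) | (a[i - 1] >> (LIMB_BITS - s))) & LIMB_MASK;
    }
    r[0] = (a[0] << s) & LIMB_MASK;
}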
*/ @@ -3894,7 +7143,6 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ @@ -4042,9 +7290,3346 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ +#endif /* WOLFSSL_SP_SMALL */ #endif /* !WOLFSSL_SP_NO_2048 */ #ifndef WOLFSSL_SP_NO_3072 +#ifdef WOLFSSL_SP_SMALL +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = n-1; i >= 0; i--) { + r[j] |= (((sp_digit)a[i]) << s); + if (s >= 52U) { + r[j] &= 0xfffffffffffffffL; + s = 60U - s; + if (j + 1 >= size) { + break; + } + r[++j] = (sp_digit)a[i] >> s; + s = 8U - s; + } + else { + s += 8U; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +} + +/* Convert an mp_int to an array of sp_digit. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a A multi-precision integer. + */ +static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) +{ +#if DIGIT_BIT == 60 + int j; + + XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); + + for (j = a->used; j < size; j++) { + r[j] = 0; + } +#elif DIGIT_BIT > 60 + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i] << s); + r[j] &= 0xfffffffffffffffL; + s = 60U - s; + if (j + 1 >= size) { + break; + } + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + while ((s + 60U) <= (word32)DIGIT_BIT) { + s += 60U; + r[j] &= 0xfffffffffffffffL; + if (j + 1 >= size) { + break; + } + if (s < (word32)DIGIT_BIT) { + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + } + else { + r[++j] = (sp_digit)0; + } + } + s = (word32)DIGIT_BIT - s; + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#else + int i; + int j = 0; + int s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i]) << s; + if (s + DIGIT_BIT >= 60) { + r[j] &= 0xfffffffffffffffL; + if (j + 1 >= size) { + break; + } + s = 60 - s; + if (s == DIGIT_BIT) { + r[++j] = 0; + s = 0; + } + else { + r[++j] = a->dp[i] >> s; + s = DIGIT_BIT - s; + } + } + else { + s += DIGIT_BIT; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#endif +} + +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 384 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_3072_to_bin_52(sp_digit* r, byte* a) +{ + int i; + int j; + int s = 0; + int b; + + for (i=0; i<51; i++) { + r[i+1] += r[i] >> 60; + r[i] &= 0xfffffffffffffffL; + } + j = 3072 / 8 - 1; + a[j] = 0; + for (i=0; i<52 && j>=0; i++) { + b = 0; + /* lint allow cast of mismatch sp_digit and int */ + a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ + b += 8 - s; + if (j < 0) { + break; + } + while (b < 60) { + a[j--] = (byte)(r[i] >> b); + b += 8; + if (j < 0) { + break; + } + } + s = 8 - (b - 60); + if (j >= 0) { + a[j] = 0; + } + if (s != 0) { + j++; + } + } +} + +/* Normalize the values in each word to 60 bits. + * + * a Array of sp_digit to normalize. 
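The small 3072-bit code keeps numbers in 60-bit limbs (mask 0xfffffffffffffff), so the byte-array conversions above walk the big-endian input eight bits at a time and split a byte whenever it straddles a limb boundary. Below is a simplified stand-alone rendition of that packing; the function and variable names are mine, and it favours readability over the in-place splitting used by sp_3072_from_bin.

#include <stdint.h>
#include <stddef.h>

#define LIMB_BITS 60
#define LIMB_MASK 0xfffffffffffffffULL

/* Pack big-endian bytes into 60-bit limbs, least significant limb first. */
static void bytes_to_limbs(uint64_t* r, size_t limbs, const uint8_t* a,
    size_t n)
{
    size_t i;
    size_t j = 0;       /* limb currently being filled   */
    unsigned s = 0;     /* bits already placed in limb j */

    for (i = 0; i < limbs; i++) {
        r[i] = 0;
    }
    for (i = n; i > 0 && j < limbs; i--) {
        uint64_t b = a[i - 1];

        r[j] |= (b << s) & LIMB_MASK;
        s += 8;
        if (s >= LIMB_BITS) {
            s -= LIMB_BITS;
            j++;
            if (j < limbs && s != 0) {
                r[j] = b >> (8 - s);
            }
        }
    }
}

Unpacking (sp_3072_to_bin_52 above) is the mirror image, after one carry-normalisation pass over the limbs.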
+ */ +static void sp_3072_norm_26(sp_digit* a) +{ + int i; + for (i = 0; i < 25; i++) { + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; + } +} + +/* Normalize the values in each word to 60 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_52(sp_digit* a) +{ + int i; + for (i = 0; i < 51; i++) { + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; + } +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_3072_mul_52(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[51]) * b[51]; + r[103] = (sp_digit)(c >> 60); + c &= 0xfffffffffffffffL; + for (k = 101; k >= 0; k--) { + if (k >= 52) { + i = k - 51; + imax = 51; + } + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 60; + r[k + 2] += (sp_digit)(c >> 60); + r[k + 1] = (sp_digit)(c & 0xfffffffffffffffL); + c = lo & 0xfffffffffffffffL; + } + r[0] = (sp_digit)c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_52(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[51]) * a[51]; + r[103] = (sp_digit)(c >> 60); + c = (c & 0xfffffffffffffffL) << 60; + for (k = 101; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 51) { + imax = k; + } + else { + imax = 51; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 120); + r[k + 1] = (sp_digit)((c >> 60) & 0xfffffffffffffffL); + c = (c & 0xfffffffffffffffL) << 60; + } + r[0] = (sp_digit)(c >> 60); +} + +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x *= 2 - b * x; /* here x*a==1 mod 2**64 */ + x &= 0xfffffffffffffffL; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 60) - x; +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_52(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 52; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0xfffffffffffffffL); + t >>= 60; + } + r[52] = (sp_digit)t; +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_26(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 26; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 3072 bits, just need to subtract. + * + * r A single precision number. 
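sp_3072_mont_setup above computes rho = -1/m mod 2^60 from nothing but the low limb of the modulus: (((b + 2) & 4) << 1) + b is an inverse of an odd b modulo 2^4, and each x *= 2 - b * x step is a Hensel (Newton) lift that doubles the number of correct low bits, so four steps reach 2^64 before the result is masked to 60 bits and negated. The stand-alone copy below uses the same arithmetic under my own function name; unsigned wrap-around supplies the mod 2^64 arithmetic, and (b * mont_rho_60(b)) & 0xfffffffffffffff comes out as all ones, i.e. -1 mod 2^60.

#include <stdint.h>

/* rho = -1/b mod 2^60 for an odd b, by Hensel lifting. */
static uint64_t mont_rho_60(uint64_t b)
{
    uint64_t x = (((b + 2) & 4) << 1) + b; /* inverse of b mod 2^4 */

    x *= 2 - b * x;                        /* ...mod 2^8  */
    x *= 2 - b * x;                        /* ...mod 2^16 */
    x *= 2 - b * x;                        /* ...mod 2^32 */
    x *= 2 - b * x;                        /* ...mod 2^64 */
    x &= 0xfffffffffffffffULL;             /* keep 60 bits */

    return ((uint64_t)1 << 60) - x;        /* negate mod 2^60 */
}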
+ * m A single precision number. + */ +static void sp_3072_mont_norm_26(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<25; i++) { + r[i] = 0xfffffffffffffffL; + } + r[25] = 0xfffffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_3072_sub_26(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_3072_cmp_26(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=25; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_3072_cond_sub_26(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 26; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_add_26(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 24; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0xfffffffffffffffL; + t[1] += t[0] >> 60; + r[i+1] = t[1] & 0xfffffffffffffffL; + t[2] += t[1] >> 60; + r[i+2] = t[2] & 0xfffffffffffffffL; + t[3] += t[2] >> 60; + r[i+3] = t[3] & 0xfffffffffffffffL; + t[0] = t[3] >> 60; + } + t[0] += (tb * a[24]) + r[24]; + t[1] = (tb * a[25]) + r[25]; + r[24] = t[0] & 0xfffffffffffffffL; + t[1] += t[0] >> 60; + r[25] = t[1] & 0xfffffffffffffffL; + r[26] += (sp_digit)(t[1] >> 60); +} + +/* Shift the result in the high 1536 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_3072_mont_shift_26(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[25] >> 36; + n += ((sp_int128)a[26]) << 24; + + for (i = 0; i < 25; i++) { + r[i] = n & 0xfffffffffffffffL; + n >>= 60; + n += ((sp_int128)a[27 + i]) << 24; + } + r[25] = (sp_digit)n; + XMEMSET(&r[26], 0, sizeof(*r) * 26U); +} + +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_3072_mont_reduce_26(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_3072_norm_26(a + 26); + + for (i=0; i<25; i++) { + mu = (a[i] * mp) & 0xfffffffffffffffL; + sp_3072_mul_add_26(a+i, m, mu); + a[i+1] += a[i] >> 60; + } + mu = (a[i] * mp) & 0xfffffffffL; + sp_3072_mul_add_26(a+i, m, mu); + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; + sp_3072_mont_shift_26(a, a); + sp_3072_cond_sub_26(a, a, m, 0 - (((a[25] - m[25]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_3072_norm_26(a); +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
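sp_3072_mont_reduce_26 above works one low limb at a time: mu = a[i] * mp mod 2^60 is chosen so that adding mu * m clears limb i (sp_3072_mul_add_26 does the addition), and once the 26 low limbs are cleared the top half is shifted down (sp_3072_mont_shift_26) and a masked conditional subtract brings the value under m. The sketch below is a plain, not constant-time-tuned rendition of the same pass; the name, the explicit carry loops and the 2n+1-limb input layout are my simplifications, and it assumes __int128 as the 64-bit SP code does.

#include <stdint.h>

#define LIMB_BITS 60
#define LIMB_MASK 0xfffffffffffffffULL

/* Montgomery reduction pass: a has 2n+1 normalised limbs holding a value
 * below m * 2^(60n); afterwards a[n..2n] holds a / 2^(60n) mod m, possibly
 * still needing one subtraction of m. mp is -1/m[0] mod 2^60. */
static void mont_reduce(uint64_t* a, const uint64_t* m, uint64_t mp, int n)
{
    int i;
    int j;

    for (i = 0; i < n; i++) {
        /* mu makes limb i of a + mu * m zero modulo 2^60. */
        uint64_t mu = (a[i] * mp) & LIMB_MASK;
        unsigned __int128 c = 0;

        for (j = 0; j < n; j++) {
            c += (unsigned __int128)mu * m[j] + a[i + j];
            a[i + j] = (uint64_t)c & LIMB_MASK;
            c >>= LIMB_BITS;
        }
        for (j = i + n; (c != 0) && (j <= 2 * n); j++) {
            c += a[j];
            a[j] = (uint64_t)c & LIMB_MASK;
            c >>= LIMB_BITS;
        }
    }
}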
+ */ +SP_NOINLINE static void sp_3072_mul_26(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[25]) * b[25]; + r[51] = (sp_digit)(c >> 60); + c &= 0xfffffffffffffffL; + for (k = 49; k >= 0; k--) { + if (k >= 26) { + i = k - 25; + imax = 25; + } + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 60; + r[k + 2] += (sp_digit)(c >> 60); + r[k + 1] = (sp_digit)(c & 0xfffffffffffffffL); + c = lo & 0xfffffffffffffffL; + } + r[0] = (sp_digit)c; +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_mul_26(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_3072_mul_26(r, a, b); + sp_3072_mont_reduce_26(r, m, mp); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_3072_sqr_26(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[25]) * a[25]; + r[51] = (sp_digit)(c >> 60); + c = (c & 0xfffffffffffffffL) << 60; + for (k = 49; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 25) { + imax = k; + } + else { + imax = 25; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 120); + r[k + 1] = (sp_digit)((c >> 60) & 0xfffffffffffffffL); + c = (c & 0xfffffffffffffffL) << 60; + } + r[0] = (sp_digit)(c >> 60); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_sqr_26(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_3072_sqr_26(r, a); + sp_3072_mont_reduce_26(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_26(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 26; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0xfffffffffffffffL); + t >>= 60; + } + r[26] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_3072_cond_add_26(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 26; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
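sp_3072_mul_26 and sp_3072_sqr_26 above (like their _52 counterparts) use product scanning: each output column k is a 128-bit sum of a[i] * b[k - i], of which 60 bits are kept per limb and the rest carried into the next column; squaring halves the work by doubling the off-diagonal products (the c += t * 2). The patch walks the columns top-down so carries can be folded into limbs already written; the version below is the more familiar low-to-high form with my own names, again leaning on __int128.

#include <stdint.h>

#define LIMB_BITS 60
#define LIMB_MASK 0xfffffffffffffffULL

/* Column-wise ("product scanning") multiply of two n-limb numbers into a
 * 2n-limb result; for n <= 52 the 128-bit column sum cannot overflow. */
static void mul_limbs(uint64_t* r, const uint64_t* a, const uint64_t* b,
    int n)
{
    unsigned __int128 acc = 0;   /* column sum plus incoming carry */
    int i;
    int k;

    for (k = 0; k < 2 * n - 1; k++) {
        int lo = (k < n) ? 0 : (k - n + 1);
        int hi = (k < n) ? k : (n - 1);

        for (i = lo; i <= hi; i++) {
            acc += (unsigned __int128)a[i] * b[k - i];
        }
        r[k] = (uint64_t)acc & LIMB_MASK;
        acc >>= LIMB_BITS;
    }
    r[2 * n - 1] = (uint64_t)acc;
}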
+ */ +SP_NOINLINE static int sp_3072_add_26(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 26; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_3072_rshift_26(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<25; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (60 - n))) & 0xfffffffffffffffL; + } + r[25] = a[25] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_3072_div_word_26(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 60 bits from d1 and top 3 bits from d0. */ + d = (d1 << 3) + (d0 >> 57); + r = d / dv; + d -= r * dv; + /* Up to 4 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 54) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 51) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 10 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 48) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 45) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 16 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 42) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 39) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 22 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 36) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 33) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 28 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 30) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 31 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 27) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 34 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 24) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 21) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 40 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 18) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 43 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 15) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 46 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 12) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 9) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 52 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 6) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 55 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 3) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 58 bits in r */ + /* Remaining 3 bits from d0. 
*/ + r <<= 3; + d <<= 3; + d += d0 & ((1 << 3) - 1); + t = d / dv; + r += t; + + /* All 60 bits from d1 and top 3 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_div_26(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 26 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 26 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 52 + 1; + sd = t2 + 26 + 1; + + sp_3072_mul_d_26(sd, d, (sp_digit)1 << 24); + sp_3072_mul_d_52(t1, a, (sp_digit)1 << 24); + dv = sd[25]; + t1[26 + 26] += t1[26 + 26 - 1] >> 60; + t1[26 + 26 - 1] &= 0xfffffffffffffffL; + for (i=26; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[26 + i]; + d1 <<= 60; + d1 += t1[26 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_26(t1[26 + i], t1[26 + i - 1], dv); +#endif + + sp_3072_mul_d_26(t2, sd, r1); + (void)sp_3072_sub_26(&t1[i], &t1[i], t2); + sp_3072_norm_26(&t1[i]); + t1[26 + i] -= t2[26]; + t1[26 + i] += t1[26 + i - 1] >> 60; + t1[26 + i - 1] &= 0xfffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[26 + i]; + d1 <<= 60; + d1 -= t1[26 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_26(-t1[26 + i], -t1[26 + i - 1], dv); +#endif + r1 -= t1[26 + i]; + sp_3072_mul_d_26(t2, sd, r1); + (void)sp_3072_add_26(&t1[i], &t1[i], t2); + t1[26 + i] += t1[26 + i - 1] >> 60; + t1[26 + i - 1] &= 0xfffffffffffffffL; + } + t1[26 - 1] += t1[26 - 2] >> 60; + t1[26 - 2] &= 0xfffffffffffffffL; + r1 = t1[26 - 1] / dv; + + sp_3072_mul_d_26(t2, sd, r1); + sp_3072_sub_26(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 52U); + for (i=0; i<25; i++) { + r[i+1] += r[i] >> 60; + r[i] &= 0xfffffffffffffffL; + } + sp_3072_cond_add_26(r, r, sd, 0 - ((r[25] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_26(r); + sp_3072_rshift_26(r, r, 24); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_mod_26(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_26(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. 
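When WOLFSSL_SP_DIV_64 is defined there is no 128-by-64-bit divide available, so sp_3072_div_word_26 above builds the quotient of a 120-bit value by a 60-bit divisor three bits at a time: after each partial division the remainder is below the divisor, so pulling in three more dividend bits keeps the working value within 64 bits. The unrolled chain above is equivalent to the rolled-up loop below (my form; it assumes dv is a normalised top limb, i.e. dv >= 2^59, so the quotient fits in 64 bits).

#include <stdint.h>

/* Quotient of the 120-bit value d1:d0 (60-bit halves) by dv, three bits at
 * a time, for builds without a native 128-bit divide. */
static uint64_t div_120_by_60(uint64_t d1, uint64_t d0, uint64_t dv)
{
    uint64_t d = (d1 << 3) + (d0 >> 57);   /* d1 plus top 3 bits of d0 */
    uint64_t r = d / dv;
    uint64_t t;
    int s;

    d -= r * dv;
    for (s = 54; s >= 0; s -= 3) {         /* next 3 bits per round */
        d = (d << 3) + ((d0 >> s) & 0x7);
        t = d / dv;
        d -= t * dv;
        r = (r << 3) + t;
    }

    return r;
}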
+ * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_26(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 52]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 26 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 26 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 26U * 2U); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_26(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_26(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 26U); + } + } + if (err == MP_OKAY) { + sp_3072_mul_26(t[1], t[1], norm); + err = sp_3072_mod_26(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 60; + c = bits % 60; + n = e[i--] << (60 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 60; + } + + y = (int)((n >> 59) & 1); + n <<= 1; + + sp_3072_mont_mul_26(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 26 * 2); + sp_3072_mont_sqr_26(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 26 * 2); + } + + sp_3072_mont_reduce_26(t[0], m, mp); + n = sp_3072_cmp_26(t[0], m); + sp_3072_cond_sub_26(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 26 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 52]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 26 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 26 * 2); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_26(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_26(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_26(t[1], t[1], norm); + err = sp_3072_mod_26(t[1], t[1], m); + } + } + else { + sp_3072_mul_26(t[1], a, norm); + err = sp_3072_mod_26(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 60; + c = bits % 60; + n = e[i--] << (60 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 60; + } + + y = (int)((n >> 59) & 1); + n <<= 1; + + sp_3072_mont_mul_26(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 26 * 2); + sp_3072_mont_sqr_26(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 26 * 2); + } + + sp_3072_mont_reduce_26(t[0], m, mp); + n = sp_3072_cmp_26(t[0], m); + sp_3072_cond_sub_26(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 26 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(32 * 52) + 52]; +#endif + sp_digit* t[32]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 52) + 52), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<32; i++) + t[i] = td + i * 52; + rt = td + 1664; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_26(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_26(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_26(t[1], t[1], norm); + err = sp_3072_mod_26(t[1], t[1], m); + } + } + else { + sp_3072_mul_26(t[1], a, norm); + err = sp_3072_mod_26(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_26(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_26(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_26(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_26(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_26(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_26(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_26(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_26(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_26(t[10], t[ 5], m, mp); + sp_3072_mont_mul_26(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_26(t[12], t[ 6], m, mp); + sp_3072_mont_mul_26(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_26(t[14], t[ 7], m, mp); + sp_3072_mont_mul_26(t[15], t[ 8], t[ 7], m, mp); + sp_3072_mont_sqr_26(t[16], t[ 8], m, mp); + sp_3072_mont_mul_26(t[17], t[ 9], t[ 8], m, mp); + sp_3072_mont_sqr_26(t[18], t[ 9], m, mp); + sp_3072_mont_mul_26(t[19], t[10], t[ 9], m, mp); + sp_3072_mont_sqr_26(t[20], t[10], m, mp); + sp_3072_mont_mul_26(t[21], t[11], t[10], m, mp); + sp_3072_mont_sqr_26(t[22], t[11], m, mp); + sp_3072_mont_mul_26(t[23], t[12], t[11], m, mp); + sp_3072_mont_sqr_26(t[24], t[12], m, mp); + sp_3072_mont_mul_26(t[25], t[13], t[12], m, mp); + sp_3072_mont_sqr_26(t[26], t[13], m, mp); + sp_3072_mont_mul_26(t[27], t[14], t[13], m, mp); + sp_3072_mont_sqr_26(t[28], t[14], m, mp); + sp_3072_mont_mul_26(t[29], t[15], t[14], m, mp); + sp_3072_mont_sqr_26(t[30], t[15], m, mp); + sp_3072_mont_mul_26(t[31], t[16], t[15], m, mp); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 59) / 60) - 1; + c = bits % 60; + if (c == 0) { + c = 60; + } + if (i < 26) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (4 - c); + c += 60; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 52); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 55; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 4; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 60 - c; + } + + sp_3072_mont_sqr_26(rt, rt, m, mp); + sp_3072_mont_sqr_26(rt, rt, m, mp); + sp_3072_mont_sqr_26(rt, rt, m, mp); + sp_3072_mont_sqr_26(rt, rt, m, mp); + sp_3072_mont_sqr_26(rt, rt, m, mp); + + sp_3072_mont_mul_26(rt, rt, t[y], m, mp); + } + + 
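+        /* Exponent consumed: rt holds the result, still in Montgomery
+         * form. The reduction below converts it back, and the
+         * cmp/cond_sub pair subtracts m exactly when rt >= m, driven by
+         * an all-ones/all-zero mask so the same word-by-word work is
+         * done either way. */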
sp_3072_mont_reduce_26(rt, m, mp); + n = sp_3072_cmp_26(rt, m); + sp_3072_cond_sub_26(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 52); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_3072_sub_52(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 52; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 3072 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_3072_mont_norm_52(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<51; i++) { + r[i] = 0xfffffffffffffffL; + } + r[51] = 0xfffL; + + /* r = (2^n - 1) mod n */ + (void)sp_3072_sub_52(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_3072_cmp_52(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=51; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_3072_cond_sub_52(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 52; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_add_52(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 48; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0xfffffffffffffffL; + t[1] += t[0] >> 60; + r[i+1] = t[1] & 0xfffffffffffffffL; + t[2] += t[1] >> 60; + r[i+2] = t[2] & 0xfffffffffffffffL; + t[3] += t[2] >> 60; + r[i+3] = t[3] & 0xfffffffffffffffL; + t[0] = t[3] >> 60; + } + t[0] += (tb * a[48]) + r[48]; + t[1] = (tb * a[49]) + r[49]; + t[2] = (tb * a[50]) + r[50]; + t[3] = (tb * a[51]) + r[51]; + r[48] = t[0] & 0xfffffffffffffffL; + t[1] += t[0] >> 60; + r[49] = t[1] & 0xfffffffffffffffL; + t[2] += t[1] >> 60; + r[50] = t[2] & 0xfffffffffffffffL; + t[3] += t[2] >> 60; + r[51] = t[3] & 0xfffffffffffffffL; + r[52] += (sp_digit)(t[3] >> 60); +} + +/* Shift the result in the high 3072 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
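The cmp/cond_sub/cond_add helpers above all follow one pattern: the caller turns a comparison into a mask that is either 0 or all ones (for example ((n < 0) ? 1 : 0) - 1 after sp_3072_cmp_52), and the helper applies the masked operand so the limb loop does identical work whether or not the operation takes effect. A minimal sketch with hypothetical names:

#include <stdint.h>

/* Subtract b from a only when mask is all ones (mask is 0 or ~0). */
static void cond_sub(uint64_t* r, const uint64_t* a, const uint64_t* b,
    uint64_t mask, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        r[i] = a[i] - (b[i] & mask);
    }
}

The patch's versions use the signed sp_digit instead, which lets individual limbs go briefly negative until a later carry pass re-balances them.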
+ */ +static void sp_3072_mont_shift_52(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[51] >> 12; + n += ((sp_int128)a[52]) << 48; + + for (i = 0; i < 51; i++) { + r[i] = n & 0xfffffffffffffffL; + n >>= 60; + n += ((sp_int128)a[53 + i]) << 48; + } + r[51] = (sp_digit)n; + XMEMSET(&r[52], 0, sizeof(*r) * 52U); +} + +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_3072_mont_reduce_52(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_3072_norm_52(a + 52); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<51; i++) { + mu = (a[i] * mp) & 0xfffffffffffffffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + } + mu = (a[i] * mp) & 0xfffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; + } + else { + for (i=0; i<51; i++) { + mu = a[i] & 0xfffffffffffffffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + } + mu = a[i] & 0xfffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; + } +#else + for (i=0; i<51; i++) { + mu = (a[i] * mp) & 0xfffffffffffffffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + } + mu = (a[i] * mp) & 0xfffL; + sp_3072_mul_add_52(a+i, m, mu); + a[i+1] += a[i] >> 60; + a[i] &= 0xfffffffffffffffL; +#endif + sp_3072_mont_shift_52(a, a); + sp_3072_cond_sub_52(a, a, m, 0 - (((a[51] - m[51]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_3072_norm_52(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_mul_52(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_3072_mul_52(r, a, b); + sp_3072_mont_reduce_52(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_3072_mont_sqr_52(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_3072_sqr_52(r, a); + sp_3072_mont_reduce_52(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_104(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 104; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0xfffffffffffffffL); + t >>= 60; + } + r[104] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_3072_cond_add_52(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 26; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static int sp_3072_add_52(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 52; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_3072_rshift_52(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<51; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (60 - n))) & 0xfffffffffffffffL; + } + r[51] = a[51] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_3072_div_word_52(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 60 bits from d1 and top 3 bits from d0. */ + d = (d1 << 3) + (d0 >> 57); + r = d / dv; + d -= r * dv; + /* Up to 4 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 54) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 7 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 51) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 10 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 48) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 45) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 16 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 42) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 19 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 39) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 22 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 36) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 33) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 28 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 30) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 31 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 27) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 34 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 24) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 21) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 40 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 18) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 43 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 15) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 46 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 12) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 9) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 52 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 6) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 55 bits in r */ + /* Next 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += (d0 >> 3) & ((1 << 3) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 58 bits in r */ + /* Remaining 3 bits from d0. 
*/ + r <<= 3; + d <<= 3; + d += d0 & ((1 << 3) - 1); + t = d / dv; + r += t; + + /* All 60 bits from d1 and top 3 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_div_52(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 52 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 52 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 104 + 1; + sd = t2 + 52 + 1; + + sp_3072_mul_d_52(sd, d, (sp_digit)1 << 48); + sp_3072_mul_d_104(t1, a, (sp_digit)1 << 48); + dv = sd[51]; + t1[52 + 52] += t1[52 + 52 - 1] >> 60; + t1[52 + 52 - 1] &= 0xfffffffffffffffL; + for (i=52; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[52 + i]; + d1 <<= 60; + d1 += t1[52 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_52(t1[52 + i], t1[52 + i - 1], dv); +#endif + + sp_3072_mul_d_52(t2, sd, r1); + (void)sp_3072_sub_52(&t1[i], &t1[i], t2); + sp_3072_norm_52(&t1[i]); + t1[52 + i] -= t2[52]; + t1[52 + i] += t1[52 + i - 1] >> 60; + t1[52 + i - 1] &= 0xfffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[52 + i]; + d1 <<= 60; + d1 -= t1[52 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_52(-t1[52 + i], -t1[52 + i - 1], dv); +#endif + r1 -= t1[52 + i]; + sp_3072_mul_d_52(t2, sd, r1); + (void)sp_3072_add_52(&t1[i], &t1[i], t2); + t1[52 + i] += t1[52 + i - 1] >> 60; + t1[52 + i - 1] &= 0xfffffffffffffffL; + } + t1[52 - 1] += t1[52 - 2] >> 60; + t1[52 - 2] &= 0xfffffffffffffffL; + r1 = t1[52 - 1] / dv; + + sp_3072_mul_d_52(t2, sd, r1); + sp_3072_sub_52(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 104U); + for (i=0; i<51; i++) { + r[i+1] += r[i] >> 60; + r[i] &= 0xfffffffffffffffL; + } + sp_3072_cond_add_52(r, r, sd, 0 - ((r[51] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_52(r); + sp_3072_rshift_52(r, r, 48); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_3072_mod_52(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_52(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. 
+ * returns 0 on success and MEMORY_E on dynamic memory allocation failure. + */ +static int sp_3072_mod_exp_52(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 104]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 52 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 52 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 52U * 2U); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_52(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 52U); + } + } + if (err == MP_OKAY) { + sp_3072_mul_52(t[1], t[1], norm); + err = sp_3072_mod_52(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 60; + c = bits % 60; + n = e[i--] << (60 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 60; + } + + y = (int)((n >> 59) & 1); + n <<= 1; + + sp_3072_mont_mul_52(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 52 * 2); + sp_3072_mont_sqr_52(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 52 * 2); + } + + sp_3072_mont_reduce_52(t[0], m, mp); + n = sp_3072_cmp_52(t[0], m); + sp_3072_cond_sub_52(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 52 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 104]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 52 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 52 * 2); + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_52(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_52(t[1], t[1], norm); + err = sp_3072_mod_52(t[1], t[1], m); + } + } + else { + sp_3072_mul_52(t[1], a, norm); + err = sp_3072_mod_52(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 60; + c = bits % 60; + n = e[i--] << (60 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 60; + } + + y = (int)((n >> 59) & 1); + n <<= 1; + + sp_3072_mont_mul_52(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 52 * 2); + sp_3072_mont_sqr_52(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 52 * 2); + } + + sp_3072_mont_reduce_52(t[0], m, mp); + n = sp_3072_cmp_52(t[0], m); + sp_3072_cond_sub_52(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 52 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 104) + 104]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 104) + 104), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 104; + rt = td + 1664; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + + if (reduceA != 0) { + err = sp_3072_mod_52(t[1], a, m); + if (err == MP_OKAY) { + sp_3072_mul_52(t[1], t[1], norm); + err = sp_3072_mod_52(t[1], t[1], m); + } + } + else { + sp_3072_mul_52(t[1], a, norm); + err = sp_3072_mod_52(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_52(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_52(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_52(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_52(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_52(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_52(t[ 7], t[ 4], t[ 3], m, mp); + sp_3072_mont_sqr_52(t[ 8], t[ 4], m, mp); + sp_3072_mont_mul_52(t[ 9], t[ 5], t[ 4], m, mp); + sp_3072_mont_sqr_52(t[10], t[ 5], m, mp); + sp_3072_mont_mul_52(t[11], t[ 6], t[ 5], m, mp); + sp_3072_mont_sqr_52(t[12], t[ 6], m, mp); + sp_3072_mont_mul_52(t[13], t[ 7], t[ 6], m, mp); + sp_3072_mont_sqr_52(t[14], t[ 7], m, mp); + sp_3072_mont_mul_52(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 59) / 60) - 1; + c = bits % 60; + if (c == 0) { + c = 60; + } + if (i < 52) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (4 - c); + c += 60; + } + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 104); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 56; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 4; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 60 - c; + } + + sp_3072_mont_sqr_52(rt, rt, m, mp); + sp_3072_mont_sqr_52(rt, rt, m, mp); + sp_3072_mont_sqr_52(rt, rt, m, mp); + sp_3072_mont_sqr_52(rt, rt, m, mp); + + sp_3072_mont_mul_52(rt, rt, t[y], m, mp); + } + + sp_3072_mont_reduce_52(rt, m, mp); + n = sp_3072_cmp_52(rt, m); + sp_3072_cond_sub_52(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 104); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_RSA +/* RSA public key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * em Public exponent. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. + * outLen Number of bytes in result. 
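
The non-small, non-cache-resistant path above uses a fixed 4-bit window: powers a^1..a^15 are precomputed in Montgomery form, and each window of the exponent costs four squarings plus one table multiply. A simplified version of the same windowing idea on single 64-bit words, using plain modular arithmetic instead of Montgomery form (names are illustrative, assuming unsigned __int128 support):

#include <stdint.h>

static uint64_t mulmod64(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((unsigned __int128)a * b) % m);
}

/* a^e mod m with a fixed 4-bit window: four squarings and one table
 * multiplication per window of the exponent. */
static uint64_t expmod_win4(uint64_t a, uint64_t e, uint64_t m)
{
    uint64_t t[16];
    uint64_t r = 1 % m;
    int i;

    t[0] = 1 % m;
    t[1] = a % m;
    for (i = 2; i < 16; i++)
        t[i] = mulmod64(t[i - 1], t[1], m);

    for (i = 60; i >= 0; i -= 4) {
        int j;
        for (j = 0; j < 4; j++)
            r = mulmod64(r, r, m);
        r = mulmod64(r, t[(e >> i) & 0xf], m);
    }
    return r;
}
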
+ * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. + */ +int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, + const mp_int* mm, byte* out, word32* outLen) +{ +#ifdef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[52 * 5]; +#endif + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit* norm = NULL; + sp_digit e[1] = {0}; + sp_digit mp; + int i; + int err = MP_OKAY; + + if (*outLen < 384U) { + err = MP_TO_E; + } + + if (err == MP_OKAY) { + if (mp_count_bits(em) > 60) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + r = a + 52 * 2; + m = r + 52 * 2; + norm = r; + + sp_3072_from_bin(a, 52, in, inLen); +#if DIGIT_BIT >= 60 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + + if (err == MP_OKAY) { + sp_3072_from_mp(m, 52, mm); + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + } + if (err == MP_OKAY) { + sp_3072_mul_52(a, a, norm); + err = sp_3072_mod_52(a, a, m); + } + if (err == MP_OKAY) { + for (i=59; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 52 * 2); + for (i--; i>=0; i--) { + sp_3072_mont_sqr_52(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_3072_mont_mul_52(r, r, a, m, mp); + } + } + sp_3072_mont_reduce_52(r, m, mp); + mp = sp_3072_cmp_52(r, m); + sp_3072_cond_sub_52(r, r, m, ((mp < 0) ? 
+ (sp_digit)1 : (sp_digit)0)- 1); + + sp_3072_to_bin_52(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[52 * 5]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit e[1] = {0}; + int err = MP_OKAY; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(em) > 60) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d; + r = a + 52 * 2; + m = r + 52 * 2; + + sp_3072_from_bin(a, 52, in, inLen); +#if DIGIT_BIT >= 60 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + if (err == MP_OKAY) { + sp_3072_from_mp(m, 52, mm); + + if (e[0] == 0x3) { + sp_3072_sqr_52(r, a); + err = sp_3072_mod_52(r, r, m); + if (err == MP_OKAY) { + sp_3072_mul_52(r, a, r); + err = sp_3072_mod_52(r, r, m); + } + } + else { + sp_digit* norm = r; + int i; + sp_digit mp; + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + + sp_3072_mul_52(a, a, norm); + err = sp_3072_mod_52(a, a, m); + + if (err == MP_OKAY) { + for (i=59; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 104U); + for (i--; i>=0; i--) { + sp_3072_mont_sqr_52(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_3072_mont_mul_52(r, r, a, m, mp); + } + } + sp_3072_mont_reduce_52(r, m, mp); + mp = sp_3072_cmp_52(r, m); + sp_3072_cond_sub_52(r, r, m, ((mp < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + } + } + + if (err == MP_OKAY) { + sp_3072_to_bin_52(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#endif /* WOLFSSL_SP_SMALL */ +} + +#ifndef WOLFSSL_RSA_PUBLIC_ONLY +#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM) +#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */ +/* RSA private key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * dm Private exponent. + * pm First prime. + * qm Second prime. + * dpm First prime's CRT exponent. + * dqm Second prime's CRT exponent. + * qim Inverse of second prime mod p. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. + * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. 
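
For the common public exponent e = 3 the non-small path above skips the Montgomery setup and computes a^3 mod m as one squaring and one multiplication, each followed by a reduction. The same shortcut sketched on single words (illustrative only, assuming unsigned __int128 support):

#include <stdint.h>

/* a^3 mod m as (a*a mod m) * a mod m - two reductions instead of a full
 * exponentiation loop, mirroring the e == 3 shortcut above. */
static uint64_t cube_mod(uint64_t a, uint64_t m)
{
    uint64_t sq = (uint64_t)(((unsigned __int128)a * a) % m);
    return (uint64_t)(((unsigned __int128)sq * a) % m);
}
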
+ */ +int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, + const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm, + const mp_int* qim, const mp_int* mm, byte* out, word32* outLen) +{ +#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[52 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 3072) { + err = MP_READ_E; + } + else if (inLen > 384) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 52; + m = a + 104; + r = a; + + sp_3072_from_bin(a, 52, in, inLen); + sp_3072_from_mp(d, 52, dm); + sp_3072_from_mp(m, 52, mm); + err = sp_3072_mod_exp_52(r, a, d, 3072, m, 0); + } + + if (err == MP_OKAY) { + sp_3072_to_bin_52(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 52); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[52 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 3072) { + err = MP_READ_E; + } + else if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 52; + m = a + 104; + r = a; + + sp_3072_from_bin(a, 52, in, inLen); + sp_3072_from_mp(d, 52, dm); + sp_3072_from_mp(m, 52, mm); + err = sp_3072_mod_exp_52(r, a, d, 3072, m, 0); + } + + if (err == MP_OKAY) { + sp_3072_to_bin_52(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 52); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#else +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[26 * 8]; +#endif + sp_digit* p = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* 
r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 384) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 26 * 8, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + if (err == MP_OKAY) { + p = a + 52; + qi = dq = dp = p + 26; + tmpa = qi + 26; + tmpb = tmpa + 52; + r = a; + + sp_3072_from_bin(a, 52, in, inLen); + sp_3072_from_mp(p, 26, pm); + sp_3072_from_mp(dp, 26, dpm); + err = sp_3072_mod_exp_26(tmpa, a, dp, 1536, p, 1); + } + if (err == MP_OKAY) { + sp_3072_from_mp(p, 26, qm); + sp_3072_from_mp(dq, 26, dqm); + err = sp_3072_mod_exp_26(tmpb, a, dq, 1536, p, 1); + } + if (err == MP_OKAY) { + sp_3072_from_mp(p, 26, pm); + (void)sp_3072_sub_26(tmpa, tmpa, tmpb); + sp_3072_norm_26(tmpa); + sp_3072_cond_add_26(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[25] >> 63)); + sp_3072_cond_add_26(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[25] >> 63)); + + sp_3072_from_mp(qi, 26, qim); + sp_3072_mul_26(tmpa, tmpa, qi); + err = sp_3072_mod_26(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_3072_from_mp(p, 26, qm); + sp_3072_mul_26(tmpa, p, tmpa); + (void)sp_3072_add_52(r, tmpb, tmpa); + sp_3072_norm_52(r); + + sp_3072_to_bin_52(r, out); + *outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 26 * 8); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[26 * 13]; +#endif + sp_digit* p = NULL; + sp_digit* q = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 384U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 26 * 13, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = a + 52 * 2; + q = p + 26; + dp = q + 26; + dq = dp + 26; + qi = dq + 26; + tmpa = qi + 26; + tmpb = tmpa + 52; + r = a; + + sp_3072_from_bin(a, 52, in, inLen); + sp_3072_from_mp(p, 26, pm); + sp_3072_from_mp(q, 26, qm); + sp_3072_from_mp(dp, 26, dpm); + sp_3072_from_mp(dq, 26, dqm); + sp_3072_from_mp(qi, 26, qim); + + err = sp_3072_mod_exp_26(tmpa, a, dp, 1536, p, 1); + } + if (err == MP_OKAY) { + err = sp_3072_mod_exp_26(tmpb, a, dq, 1536, q, 1); + } + + if (err == MP_OKAY) { + (void)sp_3072_sub_26(tmpa, tmpa, tmpb); + sp_3072_norm_26(tmpa); + sp_3072_cond_add_26(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[25] >> 63)); + sp_3072_cond_add_26(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[25] >> 63)); + sp_3072_mul_26(tmpa, tmpa, qi); + err = sp_3072_mod_26(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_3072_mul_26(tmpa, tmpa, q); + (void)sp_3072_add_52(r, tmpb, tmpa); + sp_3072_norm_52(r); + + sp_3072_to_bin_52(r, out); + 
*outLen = 384; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) +if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 26 * 13); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); + #endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */ +} + +#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */ +#endif /* WOLFSSL_HAVE_SP_RSA */ +#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_3072_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 60 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 52); + r->used = 52; + mp_clamp(r); +#elif DIGIT_BIT < 60 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 52; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 60) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 60 - s; + } + r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 52; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 60 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 60 - s; + } + else { + s += 60; + } + } + r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
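
The private-key path above is Garner's CRT recombination: m1 = c^dP mod p, m2 = c^dQ mod q, h = qInv*(m1 - m2) mod p, m = m2 + h*q, which is why only half-size (26-limb) exponentiations are needed. A toy version with word-sized integers (helper names are illustrative; the toy parameters must satisfy p*q < 2^64, and real code must also keep the exponentiations constant time):

#include <stdint.h>

static uint64_t expmod64(uint64_t b, uint64_t e, uint64_t m)
{
    uint64_t r = 1 % m;
    b %= m;
    while (e != 0) {
        if (e & 1)
            r = (uint64_t)(((unsigned __int128)r * b) % m);
        b = (uint64_t)(((unsigned __int128)b * b) % m);
        e >>= 1;
    }
    return r;
}

/* Garner recombination: recover c^d mod (p*q) from the two half-size
 * exponentiations mod p and mod q. */
static uint64_t rsa_crt(uint64_t c, uint64_t dp, uint64_t dq,
                        uint64_t p, uint64_t q, uint64_t qinv)
{
    uint64_t m1 = expmod64(c, dp, p);
    uint64_t m2 = expmod64(c, dq, q);
    uint64_t h  = (uint64_t)(((unsigned __int128)((m1 + p - m2 % p) % p)
                              * qinv) % p);
    return m2 + h * q;
}
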
+ */ +int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[52 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expBits > 3072) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 52 * 2; + m = e + 52; + r = b; + + sp_3072_from_mp(b, 52, base); + sp_3072_from_mp(e, 52, exp); + sp_3072_from_mp(m, 52, mod); + + err = sp_3072_mod_exp_52(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + err = sp_3072_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 52U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[52 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expBits > 3072) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 52 * 2; + m = e + 52; + r = b; + + sp_3072_from_mp(b, 52, base); + sp_3072_from_mp(e, 52, exp); + sp_3072_from_mp(m, 52, mod); + + err = sp_3072_mod_exp_52(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + err = sp_3072_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 52U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_3072 +SP_NOINLINE static void sp_3072_lshift_52(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + r[52] = a[51] >> (60 - n); + for (i=51; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (60 - n))) & 0xfffffffffffffffL; + } + r[0] = (a[0] << n) & 0xfffffffffffffffL; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
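
sp_3072_lshift_52 just above shifts the whole 52-limb value left by up to a limb's worth of bits, spilling into one extra limb. A generic loop version for an arbitrary limb count (a sketch; the unrolled wolfSSL routine is equivalent, assuming limbs are already normalised below 2^LIMB_BITS):

#include <stdint.h>

#define LIMB_BITS 60
#define LIMB_MASK ((((uint64_t)1) << LIMB_BITS) - 1)

/* r = a << n for 0 <= n < LIMB_BITS; r holds words + 1 limbs. */
static void lshift_limbs(uint64_t* r, const uint64_t* a, int words, unsigned n)
{
    int i;

    r[words] = a[words - 1] >> (LIMB_BITS - n);
    for (i = words - 1; i > 0; i--)
        r[i] = ((a[i] << n) | (a[i - 1] >> (LIMB_BITS - n))) & LIMB_MASK;
    r[0] = (a[0] << n) & LIMB_MASK;
}
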
+ */ +static int sp_3072_mod_exp_2_52(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[157]; +#endif + sp_digit* norm = NULL; + sp_digit* tmp = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit o; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 157, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + tmp = td + 104; + XMEMSET(td, 0, sizeof(sp_digit) * 157); + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_52(norm, m); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 59) / 60) - 1; + c = bits % 60; + if (c == 0) { + c = 60; + } + if (i < 52) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (4 - c); + c += 60; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + sp_3072_lshift_52(r, norm, (byte)y); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 4; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 55; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 4; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 60 - c; + } + + sp_3072_mont_sqr_52(r, r, m, mp); + sp_3072_mont_sqr_52(r, r, m, mp); + sp_3072_mont_sqr_52(r, r, m, mp); + sp_3072_mont_sqr_52(r, r, m, mp); + sp_3072_mont_sqr_52(r, r, m, mp); + + sp_3072_lshift_52(r, r, (byte)y); + sp_3072_mul_d_52(tmp, norm, (r[52] << 48) + (r[51] >> 12)); + r[52] = 0; + r[51] &= 0xfffL; + (void)sp_3072_add_52(r, r, tmp); + sp_3072_norm_52(r); + o = sp_3072_cmp_52(r, m); + sp_3072_cond_sub_52(r, r, m, ((o < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + + sp_3072_mont_reduce_52(r, m, mp); + n = sp_3072_cmp_52(r, m); + sp_3072_cond_sub_52(r, r, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_3072 */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. + * exp Array of bytes that is the exponent. + * expLen Length of data, in bytes, in exponent. + * mod Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 384 bytes long. + * outLen Length, in bytes, of exponentiation result. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
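
sp_3072_mod_exp_2_52 above exploits that the base is 2: after each run of squarings, multiplying by 2^y is just a left shift of the accumulator, and the bits that spill past the modulus are folded back with a single-digit multiply and an add instead of a full multiplication. The bit-by-bit version of the same idea on one word (a sketch, not the 5-bit-windowed wolfSSL routine):

#include <stdint.h>

/* 2^e mod m by square-and-shift: the "multiply by the base" step is a
 * left shift by one bit followed by a reduction. */
static uint64_t exp2_mod(uint64_t e, uint64_t m)
{
    uint64_t r = 1 % m;
    int i;

    for (i = 63; i >= 0; i--) {
        r = (uint64_t)(((unsigned __int128)r * r) % m);
        if ((e >> i) & 1)
            r = (uint64_t)(((unsigned __int128)r << 1) % m);
    }
    return r;
}
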
+ */ +int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, + const mp_int* mod, byte* out, word32* outLen) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[52 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + word32 i; + int err = MP_OKAY; + + if (mp_count_bits(base) > 3072) { + err = MP_READ_E; + } + else if (expLen > 384U) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 3072) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 52 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 52 * 2; + m = e + 52; + r = b; + + sp_3072_from_mp(b, 52, base); + sp_3072_from_bin(e, 52, exp, expLen); + sp_3072_from_mp(m, 52, mod); + + #ifdef HAVE_FFDHE_3072 + if (base->used == 1 && base->dp[0] == 2U && + ((m[51] << 20) | (m[50] >> 40)) == 0xffffffffL) { + err = sp_3072_mod_exp_2_52(r, e, expLen * 8U, m); + } + else { + #endif + err = sp_3072_mod_exp_52(r, b, e, expLen * 8U, m, 0); + #ifdef HAVE_FFDHE_3072 + } + #endif + } + + if (err == MP_OKAY) { + sp_3072_to_bin_52(r, out); + *outLen = 384; + for (i=0; i<384U && out[i] == 0U; i++) { + /* Search for first non-zero. */ + } + *outLen -= i; + XMEMMOVE(out, out + i, *outLen); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 52U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +} +#endif /* WOLFSSL_HAVE_SP_DH */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
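
After the exponentiation, sp_DhExp_3072 above strips leading zero bytes from the big-endian shared secret before returning its length. The same trimming step in isolation (a sketch with an illustrative name):

#include <string.h>

/* Drop leading zero bytes from a big-endian buffer in place and return the
 * remaining length. */
static unsigned int trim_leading_zeros(unsigned char* out, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len && out[i] == 0; i++) {
        /* search for the first non-zero byte */
    }
    memmove(out, out + i, len - i);
    return len - i;
}
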
+ */ +int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[26 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1536) { + err = MP_READ_E; + } + else if (expBits > 1536) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1536) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 26 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 26 * 2; + m = e + 26; + r = b; + + sp_3072_from_mp(b, 26, base); + sp_3072_from_mp(e, 26, exp); + sp_3072_from_mp(m, 26, mod); + + err = sp_3072_mod_exp_26(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 26, 0, sizeof(*r) * 26U); + err = sp_3072_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 52U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[26 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 1536) { + err = MP_READ_E; + } + else if (expBits > 1536) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 1536) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 26 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 26 * 2; + m = e + 26; + r = b; + + sp_3072_from_mp(b, 26, base); + sp_3072_from_mp(e, 26, exp); + sp_3072_from_mp(m, 26, mod); + + err = sp_3072_mod_exp_26(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + XMEMSET(r + 26, 0, sizeof(*r) * 26U); + err = sp_3072_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 52U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ + +#else /* Read big endian unsigned byte array into r. * * r A single precision integer. @@ -4171,7 +10756,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_54(sp_digit* r, byte* a) { int i; int j; @@ -4209,6 +10794,51 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 57 bits. + * + * a Array of sp_digit to normalize. 
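
The alternative branch that follows keeps 3072-bit values in 54 limbs of 57 bits, so limbs accumulate excess bits during additions and must be renormalised. The generic loop behind the unrolled sp_3072_norm_27/sp_3072_norm_54 routines below (a sketch on unsigned limbs; the wolfSSL versions operate on signed sp_digit limbs):

#include <stdint.h>

#define LIMB_BITS 57
#define LIMB_MASK ((((uint64_t)1) << LIMB_BITS) - 1)

/* Propagate carries so every limb except the last is reduced to
 * LIMB_BITS bits. */
static void norm_limbs(uint64_t* a, int words)
{
    int i;

    for (i = 0; i < words - 1; i++) {
        a[i + 1] += a[i] >> LIMB_BITS;
        a[i] &= LIMB_MASK;
    }
}
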
+ */ +static void sp_3072_norm_27(sp_digit* a) +{ + int i; + for (i = 0; i < 24; i += 8) { + a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; + a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; + a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; + a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; + a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; + a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; + a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; + a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; + } + a[25] += a[24] >> 57; a[24] &= 0x1ffffffffffffffL; + a[26] += a[25] >> 57; a[25] &= 0x1ffffffffffffffL; +} + +/* Normalize the values in each word to 57 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_3072_norm_54(sp_digit* a) +{ + int i; + for (i = 0; i < 48; i += 8) { + a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; + a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; + a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; + a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; + a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; + a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; + a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; + a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; + } + a[49] += a[48] >> 57; a[48] &= 0x1ffffffffffffffL; + a[50] += a[49] >> 57; a[49] &= 0x1ffffffffffffffL; + a[51] += a[50] >> 57; a[50] &= 0x1ffffffffffffffL; + a[52] += a[51] >> 57; a[51] &= 0x1ffffffffffffffL; + a[53] += a[52] >> 57; a[52] &= 0x1ffffffffffffffL; +} + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -4219,87 +10849,87 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) SP_NOINLINE static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 0]) * b[ 5] - + ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1] - + ((int128_t)a[ 5]) * b[ 0]; - int128_t t6 = ((int128_t)a[ 0]) * b[ 6] - + ((int128_t)a[ 1]) * b[ 5] - + ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2] - + ((int128_t)a[ 5]) * b[ 1] - + ((int128_t)a[ 6]) * b[ 0]; - int128_t t7 = ((int128_t)a[ 0]) * b[ 7] - + ((int128_t)a[ 1]) * b[ 6] - + ((int128_t)a[ 2]) * b[ 5] - + ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3] - + ((int128_t)a[ 5]) * b[ 2] - + ((int128_t)a[ 6]) * b[ 1] - + ((int128_t)a[ 7]) * b[ 0]; - int128_t t8 = ((int128_t)a[ 0]) * b[ 8] - + ((int128_t)a[ 1]) * b[ 7] - + ((int128_t)a[ 2]) * b[ 6] - + ((int128_t)a[ 3]) * b[ 5] - + ((int128_t)a[ 4]) * b[ 4] - + ((int128_t)a[ 5]) * b[ 3] - + ((int128_t)a[ 6]) * b[ 2] - + ((int128_t)a[ 7]) * b[ 1] - + ((int128_t)a[ 8]) * b[ 0]; - int128_t t9 = ((int128_t)a[ 1]) * b[ 8] - + ((int128_t)a[ 2]) * b[ 7] - + ((int128_t)a[ 3]) * b[ 6] - + ((int128_t)a[ 4]) * b[ 5] - + ((int128_t)a[ 5]) * b[ 4] - + ((int128_t)a[ 6]) * b[ 3] - + ((int128_t)a[ 7]) * b[ 2] - + ((int128_t)a[ 8]) * b[ 1]; - int128_t t10 = ((int128_t)a[ 2]) * b[ 8] 
- + ((int128_t)a[ 3]) * b[ 7] - + ((int128_t)a[ 4]) * b[ 6] - + ((int128_t)a[ 5]) * b[ 5] - + ((int128_t)a[ 6]) * b[ 4] - + ((int128_t)a[ 7]) * b[ 3] - + ((int128_t)a[ 8]) * b[ 2]; - int128_t t11 = ((int128_t)a[ 3]) * b[ 8] - + ((int128_t)a[ 4]) * b[ 7] - + ((int128_t)a[ 5]) * b[ 6] - + ((int128_t)a[ 6]) * b[ 5] - + ((int128_t)a[ 7]) * b[ 4] - + ((int128_t)a[ 8]) * b[ 3]; - int128_t t12 = ((int128_t)a[ 4]) * b[ 8] - + ((int128_t)a[ 5]) * b[ 7] - + ((int128_t)a[ 6]) * b[ 6] - + ((int128_t)a[ 7]) * b[ 5] - + ((int128_t)a[ 8]) * b[ 4]; - int128_t t13 = ((int128_t)a[ 5]) * b[ 8] - + ((int128_t)a[ 6]) * b[ 7] - + ((int128_t)a[ 7]) * b[ 6] - + ((int128_t)a[ 8]) * b[ 5]; - int128_t t14 = ((int128_t)a[ 6]) * b[ 8] - + ((int128_t)a[ 7]) * b[ 7] - + ((int128_t)a[ 8]) * b[ 6]; - int128_t t15 = ((int128_t)a[ 7]) * b[ 8] - + ((int128_t)a[ 8]) * b[ 7]; - int128_t t16 = ((int128_t)a[ 8]) * b[ 8]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * b[ 0]; + sp_uint128 t1 = ((sp_uint128)a[ 0]) * b[ 1] + + ((sp_uint128)a[ 1]) * b[ 0]; + sp_uint128 t2 = ((sp_uint128)a[ 0]) * b[ 2] + + ((sp_uint128)a[ 1]) * b[ 1] + + ((sp_uint128)a[ 2]) * b[ 0]; + sp_uint128 t3 = ((sp_uint128)a[ 0]) * b[ 3] + + ((sp_uint128)a[ 1]) * b[ 2] + + ((sp_uint128)a[ 2]) * b[ 1] + + ((sp_uint128)a[ 3]) * b[ 0]; + sp_uint128 t4 = ((sp_uint128)a[ 0]) * b[ 4] + + ((sp_uint128)a[ 1]) * b[ 3] + + ((sp_uint128)a[ 2]) * b[ 2] + + ((sp_uint128)a[ 3]) * b[ 1] + + ((sp_uint128)a[ 4]) * b[ 0]; + sp_uint128 t5 = ((sp_uint128)a[ 0]) * b[ 5] + + ((sp_uint128)a[ 1]) * b[ 4] + + ((sp_uint128)a[ 2]) * b[ 3] + + ((sp_uint128)a[ 3]) * b[ 2] + + ((sp_uint128)a[ 4]) * b[ 1] + + ((sp_uint128)a[ 5]) * b[ 0]; + sp_uint128 t6 = ((sp_uint128)a[ 0]) * b[ 6] + + ((sp_uint128)a[ 1]) * b[ 5] + + ((sp_uint128)a[ 2]) * b[ 4] + + ((sp_uint128)a[ 3]) * b[ 3] + + ((sp_uint128)a[ 4]) * b[ 2] + + ((sp_uint128)a[ 5]) * b[ 1] + + ((sp_uint128)a[ 6]) * b[ 0]; + sp_uint128 t7 = ((sp_uint128)a[ 0]) * b[ 7] + + ((sp_uint128)a[ 1]) * b[ 6] + + ((sp_uint128)a[ 2]) * b[ 5] + + ((sp_uint128)a[ 3]) * b[ 4] + + ((sp_uint128)a[ 4]) * b[ 3] + + ((sp_uint128)a[ 5]) * b[ 2] + + ((sp_uint128)a[ 6]) * b[ 1] + + ((sp_uint128)a[ 7]) * b[ 0]; + sp_uint128 t8 = ((sp_uint128)a[ 0]) * b[ 8] + + ((sp_uint128)a[ 1]) * b[ 7] + + ((sp_uint128)a[ 2]) * b[ 6] + + ((sp_uint128)a[ 3]) * b[ 5] + + ((sp_uint128)a[ 4]) * b[ 4] + + ((sp_uint128)a[ 5]) * b[ 3] + + ((sp_uint128)a[ 6]) * b[ 2] + + ((sp_uint128)a[ 7]) * b[ 1] + + ((sp_uint128)a[ 8]) * b[ 0]; + sp_uint128 t9 = ((sp_uint128)a[ 1]) * b[ 8] + + ((sp_uint128)a[ 2]) * b[ 7] + + ((sp_uint128)a[ 3]) * b[ 6] + + ((sp_uint128)a[ 4]) * b[ 5] + + ((sp_uint128)a[ 5]) * b[ 4] + + ((sp_uint128)a[ 6]) * b[ 3] + + ((sp_uint128)a[ 7]) * b[ 2] + + ((sp_uint128)a[ 8]) * b[ 1]; + sp_uint128 t10 = ((sp_uint128)a[ 2]) * b[ 8] + + ((sp_uint128)a[ 3]) * b[ 7] + + ((sp_uint128)a[ 4]) * b[ 6] + + ((sp_uint128)a[ 5]) * b[ 5] + + ((sp_uint128)a[ 6]) * b[ 4] + + ((sp_uint128)a[ 7]) * b[ 3] + + ((sp_uint128)a[ 8]) * b[ 2]; + sp_uint128 t11 = ((sp_uint128)a[ 3]) * b[ 8] + + ((sp_uint128)a[ 4]) * b[ 7] + + ((sp_uint128)a[ 5]) * b[ 6] + + ((sp_uint128)a[ 6]) * b[ 5] + + ((sp_uint128)a[ 7]) * b[ 4] + + ((sp_uint128)a[ 8]) * b[ 3]; + sp_uint128 t12 = ((sp_uint128)a[ 4]) * b[ 8] + + ((sp_uint128)a[ 5]) * b[ 7] + + ((sp_uint128)a[ 6]) * b[ 6] + + ((sp_uint128)a[ 7]) * b[ 5] + + ((sp_uint128)a[ 8]) * b[ 4]; + sp_uint128 t13 = ((sp_uint128)a[ 5]) * b[ 8] + + ((sp_uint128)a[ 6]) * b[ 7] + + ((sp_uint128)a[ 7]) * b[ 6] + + ((sp_uint128)a[ 8]) * b[ 5]; + sp_uint128 t14 = ((sp_uint128)a[ 6]) * b[ 
8] + + ((sp_uint128)a[ 7]) * b[ 7] + + ((sp_uint128)a[ 8]) * b[ 6]; + sp_uint128 t15 = ((sp_uint128)a[ 7]) * b[ 8] + + ((sp_uint128)a[ 8]) * b[ 7]; + sp_uint128 t16 = ((sp_uint128)a[ 8]) * b[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -4328,51 +10958,51 @@ SP_NOINLINE static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 0]) * a[ 5] - + ((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 0]) * a[ 6] - + ((int128_t)a[ 1]) * a[ 5] - + ((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 0]) * a[ 7] - + ((int128_t)a[ 1]) * a[ 6] - + ((int128_t)a[ 2]) * a[ 5] - + ((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = (((int128_t)a[ 0]) * a[ 8] - + ((int128_t)a[ 1]) * a[ 7] - + ((int128_t)a[ 2]) * a[ 6] - + ((int128_t)a[ 3]) * a[ 5]) * 2 - + ((int128_t)a[ 4]) * a[ 4]; - int128_t t9 = (((int128_t)a[ 1]) * a[ 8] - + ((int128_t)a[ 2]) * a[ 7] - + ((int128_t)a[ 3]) * a[ 6] - + ((int128_t)a[ 4]) * a[ 5]) * 2; - int128_t t10 = (((int128_t)a[ 2]) * a[ 8] - + ((int128_t)a[ 3]) * a[ 7] - + ((int128_t)a[ 4]) * a[ 6]) * 2 - + ((int128_t)a[ 5]) * a[ 5]; - int128_t t11 = (((int128_t)a[ 3]) * a[ 8] - + ((int128_t)a[ 4]) * a[ 7] - + ((int128_t)a[ 5]) * a[ 6]) * 2; - int128_t t12 = (((int128_t)a[ 4]) * a[ 8] - + ((int128_t)a[ 5]) * a[ 7]) * 2 - + ((int128_t)a[ 6]) * a[ 6]; - int128_t t13 = (((int128_t)a[ 5]) * a[ 8] - + ((int128_t)a[ 6]) * a[ 7]) * 2; - int128_t t14 = (((int128_t)a[ 6]) * a[ 8]) * 2 - + ((int128_t)a[ 7]) * a[ 7]; - int128_t t15 = (((int128_t)a[ 7]) * a[ 8]) * 2; - int128_t t16 = ((int128_t)a[ 8]) * a[ 8]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = 
(((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 5]) * a[ 8] + + ((sp_uint128)a[ 6]) * a[ 7]) * 2; + sp_uint128 t14 = (((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = ((sp_uint128)a[ 8]) * a[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -4416,33 +11046,6 @@ SP_NOINLINE static int sp_3072_add_9(sp_digit* r, const sp_digit* a, return 0; } -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_18(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 16; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[16] = a[16] + b[16]; - r[17] = a[17] + b[17]; - - return 0; -} - /* Sub b from a into r. (r = a - b) * * r A single precision integer. @@ -4470,91 +11073,18 @@ SP_NOINLINE static int sp_3072_sub_18(sp_digit* r, const sp_digit* a, return 0; } -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_18(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit* z0 = r; - sp_digit z1[18]; - sp_digit* a1 = z1; - sp_digit b1[9]; - sp_digit* z2 = r + 18; - (void)sp_3072_add_9(a1, a, &a[9]); - (void)sp_3072_add_9(b1, b, &b[9]); - sp_3072_mul_9(z2, &a[9], &b[9]); - sp_3072_mul_9(z0, a, b); - sp_3072_mul_9(z1, a1, b1); - (void)sp_3072_sub_18(z1, z1, z2); - (void)sp_3072_sub_18(z1, z1, z0); - (void)sp_3072_add_18(r + 9, r + 9, z1); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_3072_sqr_18(sp_digit* r, const sp_digit* a) -{ - sp_digit* z0 = r; - sp_digit z1[18]; - sp_digit* a1 = z1; - sp_digit* z2 = r + 18; - (void)sp_3072_add_9(a1, a, &a[9]); - sp_3072_sqr_9(z2, &a[9]); - sp_3072_sqr_9(z0, a); - sp_3072_sqr_9(z1, a1); - (void)sp_3072_sub_18(z1, z1, z2); - (void)sp_3072_sub_18(z1, z1, z0); - (void)sp_3072_add_18(r + 9, r + 9, z1); -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_36(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 32; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[32] = a[32] - b[32]; - r[33] = a[33] - b[33]; - r[34] = a[34] - b[34]; - r[35] = a[35] - b[35]; - - return 0; -} - /* Add b to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static int sp_3072_add_36(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_3072_add_18(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 32; i += 8) { + for (i = 0; i < 16; i += 8) { r[i + 0] = a[i + 0] + b[i + 0]; r[i + 1] = a[i + 1] + b[i + 1]; r[i + 2] = a[i + 2] + b[i + 2]; @@ -4564,10 +11094,8 @@ SP_NOINLINE static int sp_3072_add_36(sp_digit* r, const sp_digit* a, r[i + 6] = a[i + 6] + b[i + 6]; r[i + 7] = a[i + 7] + b[i + 7]; } - r[32] = a[32] + b[32]; - r[33] = a[33] + b[33]; - r[34] = a[34] + b[34]; - r[35] = a[35] + b[35]; + r[16] = a[16] + b[16]; + r[17] = a[17] + b[17]; return 0; } @@ -4578,48 +11106,48 @@ SP_NOINLINE static int sp_3072_add_36(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, +SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit p0[36]; - sp_digit p1[36]; - sp_digit p2[36]; - sp_digit p3[36]; - sp_digit p4[36]; - sp_digit p5[36]; - sp_digit t0[36]; - sp_digit t1[36]; - sp_digit t2[36]; - sp_digit a0[18]; - sp_digit a1[18]; - sp_digit a2[18]; - sp_digit b0[18]; - sp_digit b1[18]; - sp_digit b2[18]; - (void)sp_3072_add_18(a0, a, &a[18]); - (void)sp_3072_add_18(b0, b, &b[18]); - (void)sp_3072_add_18(a1, &a[18], &a[36]); - (void)sp_3072_add_18(b1, &b[18], &b[36]); - (void)sp_3072_add_18(a2, a0, &a[36]); - (void)sp_3072_add_18(b2, b0, &b[36]); - sp_3072_mul_18(p0, a, b); - sp_3072_mul_18(p2, &a[18], &b[18]); - sp_3072_mul_18(p4, &a[36], &b[36]); - sp_3072_mul_18(p1, a0, b0); - sp_3072_mul_18(p3, a1, b1); - sp_3072_mul_18(p5, a2, b2); - XMEMSET(r, 0, sizeof(*r)*2U*54U); - (void)sp_3072_sub_36(t0, p3, p2); - (void)sp_3072_sub_36(t1, p1, p2); - (void)sp_3072_sub_36(t2, p5, t0); - (void)sp_3072_sub_36(t2, t2, t1); - (void)sp_3072_sub_36(t0, t0, p4); - (void)sp_3072_sub_36(t1, t1, p0); - (void)sp_3072_add_36(r, r, p0); - (void)sp_3072_add_36(&r[18], &r[18], t1); - (void)sp_3072_add_36(&r[36], &r[36], t2); - (void)sp_3072_add_36(&r[54], &r[54], t0); - (void)sp_3072_add_36(&r[72], &r[72], p4); + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + sp_digit b0[9]; + sp_digit b1[9]; + sp_digit b2[9]; + (void)sp_3072_add_9(a0, a, &a[9]); + (void)sp_3072_add_9(b0, b, &b[9]); + (void)sp_3072_add_9(a1, &a[9], &a[18]); + (void)sp_3072_add_9(b1, &b[9], &b[18]); + (void)sp_3072_add_9(a2, a0, &a[18]); + (void)sp_3072_add_9(b2, b0, &b[18]); + sp_3072_mul_9(p0, a, b); + sp_3072_mul_9(p2, &a[9], &b[9]); + sp_3072_mul_9(p4, &a[18], &b[18]); + sp_3072_mul_9(p1, a0, b0); + sp_3072_mul_9(p3, a1, b1); + sp_3072_mul_9(p5, a2, b2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_3072_sub_18(t0, p3, p2); + (void)sp_3072_sub_18(t1, p1, p2); + (void)sp_3072_sub_18(t2, p5, t0); + (void)sp_3072_sub_18(t2, t2, t1); + (void)sp_3072_sub_18(t0, t0, p4); + (void)sp_3072_sub_18(t1, t1, p0); + (void)sp_3072_add_18(r, r, p0); + (void)sp_3072_add_18(&r[9], &r[9], t1); + (void)sp_3072_add_18(&r[18], &r[18], t2); + (void)sp_3072_add_18(&r[27], &r[27], t0); + (void)sp_3072_add_18(&r[36], &r[36], p4); } /* Square a into r. (r = a * a) @@ -4627,63 +11155,71 @@ SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. 
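
The new sp_3072_mul_27 above (and the sp_3072_sqr_27 that follows) replaces the nine 9-limb products of a schoolbook 3-way split with six, using the standard interpolation identity behind p0..p5 and t0..t2. The same identity demonstrated on machine words split at bits 21 and 42 (a self-contained sketch, not the wolfSSL data layout; assumes unsigned __int128 support):

#include <stdint.h>

/* Three-way split multiplication with six half-size products:
 * a = a0 + a1*X + a2*X^2, b likewise, X = 2^21 here (9 limbs in the
 * wolfSSL code).  The return value equals a * b. */
static unsigned __int128 mul3way(uint64_t a, uint64_t b)
{
    const uint64_t M = ((uint64_t)1 << 21) - 1;
    uint64_t a0 = a & M, a1 = (a >> 21) & M, a2 = a >> 42;
    uint64_t b0 = b & M, b1 = (b >> 21) & M, b2 = b >> 42;
    unsigned __int128 p0 = (unsigned __int128)a0 * b0;
    unsigned __int128 p2 = (unsigned __int128)a1 * b1;
    unsigned __int128 p4 = (unsigned __int128)a2 * b2;
    unsigned __int128 p1 = (unsigned __int128)(a0 + a1) * (b0 + b1);
    unsigned __int128 p3 = (unsigned __int128)(a1 + a2) * (b1 + b2);
    unsigned __int128 p5 = (unsigned __int128)(a0 + a1 + a2) * (b0 + b1 + b2);

    return p0
         + ((p1 - p0 - p2) << 21)                 /* a0b1 + a1b0          */
         + ((p5 - (p3 - p2) - (p1 - p2)) << 42)   /* a0b2 + a1b1 + a2b0   */
         + (((p3 - p2) - p4) << 63)               /* a1b2 + a2b1          */
         + (p4 << 84);                            /* a2b2                 */
}
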
*/ -SP_NOINLINE static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a) { - sp_digit p0[36]; - sp_digit p1[36]; - sp_digit p2[36]; - sp_digit p3[36]; - sp_digit p4[36]; - sp_digit p5[36]; - sp_digit t0[36]; - sp_digit t1[36]; - sp_digit t2[36]; - sp_digit a0[18]; - sp_digit a1[18]; - sp_digit a2[18]; - (void)sp_3072_add_18(a0, a, &a[18]); - (void)sp_3072_add_18(a1, &a[18], &a[36]); - (void)sp_3072_add_18(a2, a0, &a[36]); - sp_3072_sqr_18(p0, a); - sp_3072_sqr_18(p2, &a[18]); - sp_3072_sqr_18(p4, &a[36]); - sp_3072_sqr_18(p1, a0); - sp_3072_sqr_18(p3, a1); - sp_3072_sqr_18(p5, a2); - XMEMSET(r, 0, sizeof(*r)*2U*54U); - (void)sp_3072_sub_36(t0, p3, p2); - (void)sp_3072_sub_36(t1, p1, p2); - (void)sp_3072_sub_36(t2, p5, t0); - (void)sp_3072_sub_36(t2, t2, t1); - (void)sp_3072_sub_36(t0, t0, p4); - (void)sp_3072_sub_36(t1, t1, p0); - (void)sp_3072_add_36(r, r, p0); - (void)sp_3072_add_36(&r[18], &r[18], t1); - (void)sp_3072_add_36(&r[36], &r[36], t2); - (void)sp_3072_add_36(&r[54], &r[54], t0); - (void)sp_3072_add_36(&r[72], &r[72], p4); + sp_digit p0[18]; + sp_digit p1[18]; + sp_digit p2[18]; + sp_digit p3[18]; + sp_digit p4[18]; + sp_digit p5[18]; + sp_digit t0[18]; + sp_digit t1[18]; + sp_digit t2[18]; + sp_digit a0[9]; + sp_digit a1[9]; + sp_digit a2[9]; + (void)sp_3072_add_9(a0, a, &a[9]); + (void)sp_3072_add_9(a1, &a[9], &a[18]); + (void)sp_3072_add_9(a2, a0, &a[18]); + sp_3072_sqr_9(p0, a); + sp_3072_sqr_9(p2, &a[9]); + sp_3072_sqr_9(p4, &a[18]); + sp_3072_sqr_9(p1, a0); + sp_3072_sqr_9(p3, a1); + sp_3072_sqr_9(p5, a2); + XMEMSET(r, 0, sizeof(*r)*2U*27U); + (void)sp_3072_sub_18(t0, p3, p2); + (void)sp_3072_sub_18(t1, p1, p2); + (void)sp_3072_sub_18(t2, p5, t0); + (void)sp_3072_sub_18(t2, t2, t1); + (void)sp_3072_sub_18(t0, t0, p4); + (void)sp_3072_sub_18(t1, t1, p0); + (void)sp_3072_add_18(r, r, p0); + (void)sp_3072_add_18(&r[9], &r[9], t1); + (void)sp_3072_add_18(&r[18], &r[18], t2); + (void)sp_3072_add_18(&r[27], &r[27], t0); + (void)sp_3072_add_18(&r[36], &r[36], p4); } -#endif /* !WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static int sp_3072_add_54(sp_digit* r, const sp_digit* a, +SP_NOINLINE static int sp_3072_add_27(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - for (i = 0; i < 54; i++) { - r[i] = a[i] + b[i]; + for (i = 0; i < 24; i += 8) { + r[i + 0] = a[i + 0] + b[i + 0]; + r[i + 1] = a[i + 1] + b[i + 1]; + r[i + 2] = a[i + 2] + b[i + 2]; + r[i + 3] = a[i + 3] + b[i + 3]; + r[i + 4] = a[i + 4] + b[i + 4]; + r[i + 5] = a[i + 5] + b[i + 5]; + r[i + 6] = a[i + 6] + b[i + 6]; + r[i + 7] = a[i + 7] + b[i + 7]; } + r[24] = a[24] + b[24]; + r[25] = a[25] + b[25]; + r[26] = a[26] + b[26]; return 0; } -#else + /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -4715,27 +11251,6 @@ SP_NOINLINE static int sp_3072_add_54(sp_digit* r, const sp_digit* a, return 0; } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_54(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 54; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#else /* Sub b from a into r. (r = a - b) * * r A single precision integer. 
@@ -4767,8 +11282,6 @@ SP_NOINLINE static int sp_3072_sub_54(sp_digit* r, const sp_digit* a, return 0; } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. @@ -4778,31 +11291,19 @@ SP_NOINLINE static int sp_3072_sub_54(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[53]) * b[53]; - r[107] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 105; k >= 0; k--) { - for (i = 53; i >= 0; i--) { - j = k - i; - if (j >= 54) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); + sp_digit* z0 = r; + sp_digit z1[54]; + sp_digit* a1 = z1; + sp_digit b1[27]; + sp_digit* z2 = r + 54; + (void)sp_3072_add_27(a1, a, &a[27]); + (void)sp_3072_add_27(b1, b, &b[27]); + sp_3072_mul_27(z2, &a[27], &b[27]); + sp_3072_mul_27(z0, a, b); + sp_3072_mul_27(z1, a1, b1); + (void)sp_3072_sub_54(z1, z1, z2); + (void)sp_3072_sub_54(z1, z1, z0); + (void)sp_3072_add_54(r + 27, r + 27, z1); } /* Square a and put result in r. (r = a * a) @@ -4812,269 +11313,20 @@ SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a) { - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[53]) * a[53]; - r[107] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 105; k >= 0; k--) { - for (i = 53; i >= 0; i--) { - j = k - i; - if (j >= 54 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); + sp_digit* z0 = r; + sp_digit z1[54]; + sp_digit* a1 = z1; + sp_digit* z2 = r + 54; + (void)sp_3072_add_27(a1, a, &a[27]); + sp_3072_sqr_27(z2, &a[27]); + sp_3072_sqr_27(z0, a); + sp_3072_sqr_27(z1, a1); + (void)sp_3072_sub_54(z1, z1, z2); + (void)sp_3072_sub_54(z1, z1, z0); + (void)sp_3072_add_54(r + 27, r + 27, z1); } -#endif /* WOLFSSL_SP_SMALL */ -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_add_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 27; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#else -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
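
The rewritten sp_3072_mul_54 above is one level of Karatsuba: the 54-limb operands are split into 27-limb halves and the product is assembled from three 27-limb multiplications instead of four. The same structure on 64-bit words split at bit 32 (a sketch, assuming unsigned __int128 support):

#include <stdint.h>

/* One level of Karatsuba: z0 = a0*b0, z2 = a1*b1,
 * z1 = (a0+a1)*(b0+b1) - z0 - z2, result = z0 + z1*X + z2*X^2. */
static unsigned __int128 karatsuba64(uint64_t a, uint64_t b)
{
    uint64_t a0 = (uint32_t)a, a1 = a >> 32;
    uint64_t b0 = (uint32_t)b, b1 = b >> 32;
    unsigned __int128 z0 = (unsigned __int128)a0 * b0;
    unsigned __int128 z2 = (unsigned __int128)a1 * b1;
    unsigned __int128 z1 = (unsigned __int128)(a0 + a1) * (b0 + b1) - z0 - z2;

    return z0 + (z1 << 32) + (z2 << 64);
}
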
- */ -SP_NOINLINE static int sp_3072_add_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 24; i += 8) { - r[i + 0] = a[i + 0] + b[i + 0]; - r[i + 1] = a[i + 1] + b[i + 1]; - r[i + 2] = a[i + 2] + b[i + 2]; - r[i + 3] = a[i + 3] + b[i + 3]; - r[i + 4] = a[i + 4] + b[i + 4]; - r[i + 5] = a[i + 5] + b[i + 5]; - r[i + 6] = a[i + 6] + b[i + 6]; - r[i + 7] = a[i + 7] + b[i + 7]; - } - r[24] = a[24] + b[24]; - r[25] = a[25] + b[25]; - r[26] = a[26] + b[26]; - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 27; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_3072_sub_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 24; i += 8) { - r[i + 0] = a[i + 0] - b[i + 0]; - r[i + 1] = a[i + 1] - b[i + 1]; - r[i + 2] = a[i + 2] - b[i + 2]; - r[i + 3] = a[i + 3] - b[i + 3]; - r[i + 4] = a[i + 4] - b[i + 4]; - r[i + 5] = a[i + 5] - b[i + 5]; - r[i + 6] = a[i + 6] - b[i + 6]; - r[i + 7] = a[i + 7] - b[i + 7]; - } - r[24] = a[24] - b[24]; - r[25] = a[25] - b[25]; - r[26] = a[26] - b[26]; - - return 0; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[26]) * b[26]; - r[53] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; - for (k = 51; k >= 0; k--) { - for (i = 26; i >= 0; i--) { - j = k - i; - if (j >= 27) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; - } - r[0] = (sp_digit)(c >> 57); -} - -#else -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int128_t t[54]; - - XMEMSET(t, 0, sizeof(t)); - for (i=0; i<27; i++) { - for (j=0; j<27; j++) { - t[i+j] += ((int128_t)a[i]) * b[j]; - } - } - for (i=0; i<53; i++) { - r[i] = t[i] & 0x1ffffffffffffffL; - t[i+1] += t[i] >> 57; - } - r[53] = (sp_digit)t[53]; -} - -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
- */
-SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
-{
-    int i;
-    int j;
-    int k;
-    int128_t c;
-
-    c = ((int128_t)a[26]) * a[26];
-    r[53] = (sp_digit)(c >> 57);
-    c = (c & 0x1ffffffffffffffL) << 57;
-    for (k = 51; k >= 0; k--) {
-        for (i = 26; i >= 0; i--) {
-            j = k - i;
-            if (j >= 27 || i <= j) {
-                break;
-            }
-            if (j < 0) {
-                continue;
-            }
-
-            c += ((int128_t)a[i]) * a[j] * 2;
-        }
-        if (i == j) {
-            c += ((int128_t)a[i]) * a[i];
-        }
-
-        r[k + 2] += (sp_digit)(c >> 114);
-        r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL);
-        c = (c & 0x1ffffffffffffffL) << 57;
-    }
-    r[0] = (sp_digit)(c >> 57);
-}
-
-#else
-/* Square a and put result in r. (r = a * a)
- *
- * r A single precision integer.
- * a A single precision integer.
- */
-SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
-{
-    int i;
-    int j;
-    int128_t t[54];
-
-    XMEMSET(t, 0, sizeof(t));
-    for (i=0; i<27; i++) {
-        for (j=0; j<i; j++) {
-            t[i+j] += (((int128_t)a[i]) * a[j]) * 2;
-        }
-        t[i+i] += ((int128_t)a[i]) * a[i];
-    }
-    for (i=0; i<53; i++) {
-        r[i] = t[i] & 0x1ffffffffffffffL;
-        t[i+1] += t[i] >> 57;
-    }
-    r[53] = (sp_digit)t[53];
-}
-
-#endif /* WOLFSSL_SP_SMALL */
-#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */
-
+#endif /* !WOLFSSL_SP_SMALL */
 /* Caclulate the bottom digit of -1/a mod 2^n.
  *
  * a A single precision number.
@@ -5106,22 +11358,10 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
 SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a,
     sp_digit b)
 {
-#ifdef WOLFSSL_SP_SMALL
-    int128_t tb = b;
-    int128_t t = 0;
-    int i;
-
-    for (i = 0; i < 54; i++) {
-        t += tb * a[i];
-        r[i] = (sp_digit)(t & 0x1ffffffffffffffL);
-        t >>= 57;
-    }
-    r[54] = (sp_digit)t;
-#else
-    int128_t tb = b;
-    int128_t t = 0;
+    sp_int128 tb = b;
+    sp_int128 t = 0;
     sp_digit t2;
-    int128_t p[4];
+    sp_int128 p[4];
     int i;
 
     for (i = 0; i < 52; i += 4) {
@@ -5153,10 +11393,37 @@ SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a,
     r[53] = (sp_digit)(t & 0x1ffffffffffffffL);
     t >>= 57;
     r[54] = (sp_digit)(t & 0x1ffffffffffffffL);
-#endif /* WOLFSSL_SP_SMALL */
 }
 
 #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
+/* Sub b from a into r. (r = a - b)
+ *
+ * r A single precision integer.
+ * a A single precision integer.
+ * b A single precision integer.
+ */
+SP_NOINLINE static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 24; i += 8) {
+        r[i + 0] = a[i + 0] - b[i + 0];
+        r[i + 1] = a[i + 1] - b[i + 1];
+        r[i + 2] = a[i + 2] - b[i + 2];
+        r[i + 3] = a[i + 3] - b[i + 3];
+        r[i + 4] = a[i + 4] - b[i + 4];
+        r[i + 5] = a[i + 5] - b[i + 5];
+        r[i + 6] = a[i + 6] - b[i + 6];
+        r[i + 7] = a[i + 7] - b[i + 7];
+    }
+    r[24] = a[24] - b[24];
+    r[25] = a[25] - b[25];
+    r[26] = a[26] - b[26];
+
+    return 0;
+}
+
 /* r = 2^n mod m where n is the number of bits to reduce by.
  * Given m must be 3072 bits, just need to subtract.
  *
@@ -5166,13 +11433,6 @@ SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a,
 static void sp_3072_mont_norm_27(sp_digit* r, const sp_digit* m)
 {
     /* Set r = 2^n - 1. 
*/ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<26; i++) { - r[i] = 0x1ffffffffffffffL; - } -#else int i; for (i = 0; i < 24; i += 8) { @@ -5187,7 +11447,6 @@ static void sp_3072_mont_norm_27(sp_digit* r, const sp_digit* m) } r[24] = 0x1ffffffffffffffL; r[25] = 0x1ffffffffffffffL; -#endif r[26] = 0x3fffffffffffffL; /* r = (2^n - 1) mod n */ @@ -5207,13 +11466,6 @@ static void sp_3072_mont_norm_27(sp_digit* r, const sp_digit* m) static sp_digit sp_3072_cmp_27(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=26; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[26] - b[26]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -5229,7 +11481,6 @@ static sp_digit sp_3072_cmp_27(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -5245,13 +11496,6 @@ static sp_digit sp_3072_cmp_27(const sp_digit* a, const sp_digit* b) static void sp_3072_cond_sub_27(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 27; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 24; i += 8) { @@ -5267,7 +11511,6 @@ static void sp_3072_cond_sub_27(sp_digit* r, const sp_digit* a, r[24] = a[24] - (b[24] & m); r[25] = a[25] - (b[25] & m); r[26] = a[26] - (b[26] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -5279,20 +11522,8 @@ static void sp_3072_cond_sub_27(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_mul_add_27(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 27; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffffffffffffL; - t >>= 57; - } - r[27] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffffffffffffL); @@ -5319,36 +11550,6 @@ SP_NOINLINE static void sp_3072_mul_add_27(sp_digit* r, const sp_digit* a, t[2] = tb * a[26]; r[26] += (sp_digit)((t[1] >> 57) + (t[2] & 0x1ffffffffffffffL)); r[27] += (sp_digit)(t[2] >> 57); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 57. - * - * a Array of sp_digit to normalize. - */ -static void sp_3072_norm_27(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 26; i++) { - a[i+1] += a[i] >> 57; - a[i] &= 0x1ffffffffffffffL; - } -#else - int i; - for (i = 0; i < 24; i += 8) { - a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; - a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; - a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; - a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; - a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; - a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; - a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; - a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; - } - a[24+1] += a[24] >> 57; a[24] &= 0x1ffffffffffffffL; - a[25+1] += a[25] >> 57; a[25] &= 0x1ffffffffffffffL; -#endif } /* Shift the result in the high 1536 bits down to the bottom. 
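
/* [Editor's illustration - not part of the patch.]  sp_3072_cmp_27() and
 * sp_3072_cond_sub_27() above are branch-free: the caller derives a mask that
 * is either 0 or all ones and the subtraction r[i] = a[i] - (b[i] & m) then
 * happens, or not, without any data-dependent branch.  A minimal stand-alone
 * version of the idea on four 64-bit limbs (sizes and names are the
 * editor's): */
#include <stdint.h>
#include <stdio.h>

#define LIMBS 4

static void cond_sub(uint64_t* r, const uint64_t* a, const uint64_t* b,
                     uint64_t m)                /* m is 0 or 0xffff...ff */
{
    int i;
    for (i = 0; i < LIMBS; i++) {
        r[i] = a[i] - (b[i] & m);               /* b[i] & 0 == 0: no-op subtract */
    }
}

int main(void)
{
    uint64_t a[LIMBS] = { 9, 8, 7, 6 };
    uint64_t b[LIMBS] = { 1, 2, 3, 4 };
    uint64_t r[LIMBS];
    int subtract = 1;                           /* the secret condition */
    /* Turn the 0/1 condition into a 0/all-ones mask without branching. */
    uint64_t mask = (uint64_t)0 - (uint64_t)subtract;

    cond_sub(r, a, b, mask);
    printf("%llu %llu %llu %llu\n",
           (unsigned long long)r[0], (unsigned long long)r[1],
           (unsigned long long)r[2], (unsigned long long)r[3]);  /* 8 6 4 2 */
    return 0;
}
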
@@ -5358,22 +11559,6 @@ static void sp_3072_norm_27(sp_digit* a) */ static void sp_3072_mont_shift_27(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL - int i; - sp_digit n; - sp_digit s; - - s = a[27]; - n = a[26] >> 54; - for (i = 0; i < 26; i++) { - n += (s & 0x1ffffffffffffffL) << 3; - r[i] = n & 0x1ffffffffffffffL; - n >>= 57; - s = a[28 + i] + (s >> 57); - } - n += s << 3; - r[26] = n; -#else sp_digit n; sp_digit s; int i; @@ -5402,7 +11587,6 @@ static void sp_3072_mont_shift_27(sp_digit* r, const sp_digit* a) n += (s & 0x1ffffffffffffffL) << 3; r[25] = n & 0x1ffffffffffffffL; n >>= 57; s = a[53] + (s >> 57); n += s << 3; r[26] = n; -#endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[27], 0, sizeof(*r) * 27U); } @@ -5429,7 +11613,7 @@ static void sp_3072_mont_reduce_27(sp_digit* a, const sp_digit* m, sp_digit mp) a[i+1] += a[i] >> 57; a[i] &= 0x1ffffffffffffffL; sp_3072_mont_shift_27(a, a); - sp_3072_cond_sub_27(a, a, m, 0 - (((a[26] >> 54) > 0) ? + sp_3072_cond_sub_27(a, a, m, 0 - (((a[26] - m[26]) > 0) ? (sp_digit)1 : (sp_digit)0)); sp_3072_norm_27(a); } @@ -5473,22 +11657,10 @@ static void sp_3072_mont_sqr_27(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 27; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1ffffffffffffffL); - t >>= 57; - } - r[27] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 24; i += 4) { @@ -5523,7 +11695,6 @@ SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, r[26] = (sp_digit)(t & 0x1ffffffffffffffL); t >>= 57; r[27] = (sp_digit)(t & 0x1ffffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ } /* Conditionally add a and b using the mask m. @@ -5537,13 +11708,6 @@ SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, static void sp_3072_cond_add_27(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 27; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 24; i += 8) { @@ -5559,7 +11723,26 @@ static void sp_3072_cond_add_27(sp_digit* r, const sp_digit* a, r[24] = a[24] + (b[24] & m); r[25] = a[25] + (b[25] & m); r[26] = a[26] + (b[26] & m); -#endif /* WOLFSSL_SP_SMALL */ +} + +SP_NOINLINE static void sp_3072_rshift_27(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<24; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (57 - n)) & 0x1ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (57 - n)) & 0x1ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (57 - n)) & 0x1ffffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (57 - n)) & 0x1ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (57 - n)) & 0x1ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (57 - n)) & 0x1ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (57 - n)) & 0x1ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (57 - n)) & 0x1ffffffffffffffL); + } + r[24] = (a[24] >> n) | ((a[25] << (57 - n)) & 0x1ffffffffffffffL); + r[25] = (a[25] >> n) | ((a[26] << (57 - n)) & 0x1ffffffffffffffL); + r[26] = a[26] >> n; } #ifdef WOLFSSL_SP_DIV_64 @@ -5654,7 +11837,7 @@ static WC_INLINE sp_digit sp_3072_div_word_27(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. 
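
/* [Editor's illustration - not part of the patch.]  sp_3072_mont_reduce_27()
 * above is a word-by-word Montgomery reduction: for each low word it picks
 * mu = a[i]*mp (mod 2^57), adds mu*m so that word becomes zero, then shifts
 * the whole value down and conditionally subtracts m once.  The same
 * algorithm with a single 64-bit limb (R = 2^64); unsigned __int128 is
 * assumed, and the modulus is kept below 2^63 so this simplified 128-bit sum
 * cannot wrap (the multi-word code has no such restriction). */
#include <stdint.h>
#include <stdio.h>

/* mp = -1/m mod 2^64 for odd m, via Newton steps that double the precision
 * (the same doubling iteration sp_3072_mont_setup() uses, seeded slightly
 * differently there). */
static uint64_t neg_inv64(uint64_t m)
{
    uint64_t x = m;                     /* m*m == 1 (mod 8): 3 correct bits   */
    int i;
    for (i = 0; i < 5; i++) {
        x *= 2 - m * x;                 /* 3 -> 6 -> 12 -> 24 -> 48 -> 96     */
    }
    return (uint64_t)0 - x;             /* negate so that m*mp == -1 mod 2^64 */
}

/* REDC: for T < m*2^64, return T * 2^-64 mod m. */
static uint64_t redc(unsigned __int128 T, uint64_t m, uint64_t mp)
{
    uint64_t mu = (uint64_t)T * mp;                       /* kills the low word */
    unsigned __int128 t = (T + (unsigned __int128)mu * m) >> 64;
    return (t >= m) ? (uint64_t)(t - m) : (uint64_t)t;    /* one cond. subtract */
}

int main(void)
{
    uint64_t m  = 0x7fffffffffffffe7ULL;    /* odd, < 2^63 (see note above) */
    uint64_t mp = neg_inv64(m);
    uint64_t a  = 0x123456789abcdef1ULL % m;
    uint64_t b  = 0x0fedcba987654321ULL % m;
    uint64_t r  = redc((unsigned __int128)a * b, m, mp);  /* a*b*2^-64 mod m */

    /* Multiplying back by 2^64 must give a*b mod m again. */
    int ok = (((unsigned __int128)r << 64) % m) ==
             (((unsigned __int128)a * b) % m);
    printf("redc %s\n", ok ? "ok" : "broken");
    return 0;
}
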
* - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -5662,40 +11845,45 @@ static WC_INLINE sp_digit sp_3072_div_word_27(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_3072_div_27(const sp_digit* a, const sp_digit* d, +static int sp_3072_div_27(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 27 + 1]; + sp_digit t1[4 * 27 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 27 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 27 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 27; + (void)m; - dv = d[26]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 27U); - for (i=26; i>=0; i--) { - t1[27 + i] += t1[27 + i - 1] >> 57; - t1[27 + i - 1] &= 0x1ffffffffffffffL; + if (err == MP_OKAY) { + t2 = t1 + 54 + 1; + sd = t2 + 27 + 1; + + sp_3072_mul_d_27(sd, d, (sp_digit)1 << 3); + sp_3072_mul_d_54(t1, a, (sp_digit)1 << 3); + dv = sd[26]; + t1[27 + 27] += t1[27 + 27 - 1] >> 57; + t1[27 + 27 - 1] &= 0x1ffffffffffffffL; + for (i=27; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_64 d1 = t1[27 + i]; d1 <<= 57; @@ -5705,15 +11893,22 @@ static int sp_3072_div_27(const sp_digit* a, const sp_digit* d, r1 = sp_3072_div_word_27(t1[27 + i], t1[27 + i - 1], dv); #endif - sp_3072_mul_d_27(t2, d, r1); + sp_3072_mul_d_27(t2, sd, r1); (void)sp_3072_sub_27(&t1[i], &t1[i], t2); sp_3072_norm_27(&t1[i]); t1[27 + i] -= t2[27]; t1[27 + i] += t1[27 + i - 1] >> 57; t1[27 + i - 1] &= 0x1ffffffffffffffL; - r1 = (((-t1[27 + i]) << 57) - t1[27 + i - 1]) / dv; - r1++; - sp_3072_mul_d_27(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[27 + i]; + d1 <<= 57; + d1 -= t1[27 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_27(-t1[27 + i], -t1[27 + i - 1], dv); +#endif + r1 -= t1[27 + i]; + sp_3072_mul_d_27(t2, sd, r1); (void)sp_3072_add_27(&t1[i], &t1[i], t2); t1[27 + i] += t1[27 + i - 1] >> 57; t1[27 + i - 1] &= 0x1ffffffffffffffL; @@ -5722,15 +11917,18 @@ static int sp_3072_div_27(const sp_digit* a, const sp_digit* d, t1[27 - 2] &= 0x1ffffffffffffffL; r1 = t1[27 - 1] / dv; - sp_3072_mul_d_27(t2, d, r1); - (void)sp_3072_sub_27(t1, t1, t2); + sp_3072_mul_d_27(t2, sd, r1); + sp_3072_sub_27(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 54U); for (i=0; i<26; i++) { r[i+1] += r[i] >> 57; r[i] &= 0x1ffffffffffffffL; } - sp_3072_cond_add_27(r, r, d, 0 - ((r[26] < 0) ? + sp_3072_cond_add_27(r, r, sd, 0 - ((r[26] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_27(r); + sp_3072_rshift_27(r, r, 3); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -6035,13 +12233,25 @@ static int sp_3072_mod_exp_27(sp_digit* r, const sp_digit* a, const sp_digit* e, c -= 5; XMEMCPY(rt, t[y], sizeof(sp_digit) * 54); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 52; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 7; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_3072_mont_sqr_27(rt, rt, m, mp); sp_3072_mont_sqr_27(rt, rt, m, mp); @@ -6079,13 +12289,6 @@ static int sp_3072_mod_exp_27(sp_digit* r, const sp_digit* a, const sp_digit* e, static void sp_3072_mont_norm_54(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<53; i++) { - r[i] = 0x1ffffffffffffffL; - } -#else int i; for (i = 0; i < 48; i += 8) { @@ -6103,7 +12306,6 @@ static void sp_3072_mont_norm_54(sp_digit* r, const sp_digit* m) r[50] = 0x1ffffffffffffffL; r[51] = 0x1ffffffffffffffL; r[52] = 0x1ffffffffffffffL; -#endif r[53] = 0x7ffffffffffffL; /* r = (2^n - 1) mod n */ @@ -6123,13 +12325,6 @@ static void sp_3072_mont_norm_54(sp_digit* r, const sp_digit* m) static sp_digit sp_3072_cmp_54(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=53; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[53] - b[53]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -6148,7 +12343,6 @@ static sp_digit sp_3072_cmp_54(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -6164,13 +12358,6 @@ static sp_digit sp_3072_cmp_54(const sp_digit* a, const sp_digit* b) static void sp_3072_cond_sub_54(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 54; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 48; i += 8) { @@ -6189,7 +12376,6 @@ static void sp_3072_cond_sub_54(sp_digit* r, const sp_digit* a, r[51] = a[51] - (b[51] & m); r[52] = a[52] - (b[52] & m); r[53] = a[53] - (b[53] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -6201,20 +12387,8 @@ static void sp_3072_cond_sub_54(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_3072_mul_add_54(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 54; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffffffffffffL; - t >>= 57; - } - r[54] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffffffffffffL); @@ -6247,39 +12421,6 @@ SP_NOINLINE static void sp_3072_mul_add_54(sp_digit* r, const sp_digit* a, t[5] = tb * a[53]; r[53] += (sp_digit)((t[4] >> 57) + (t[5] & 0x1ffffffffffffffL)); r[54] += (sp_digit)(t[5] >> 57); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 57. - * - * a Array of sp_digit to normalize. 
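
/* [Editor's illustration - not part of the patch.]  The reworked
 * sp_3072_div_27(), whose tail appears just above, scales both the divisor
 * and the dividend by 2^3 (sp_3072_mul_d_27/_54 with 1 << 3) so the quotient
 * word estimated from dv = sd[26] is taken from a normalised top word, and at
 * the very end right-shifts the remainder by 3 again (sp_3072_rshift_27).
 * The final shift is valid because (a*2^s) mod (d*2^s) == (a mod d) * 2^s,
 * checked here with plain 64-bit integers (values chosen small enough that
 * the shifts cannot overflow): */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t a = 0x123456789abcdefULL;      /* dividend (< 2^57) */
    uint64_t d = 0x10000000123ULL;          /* divisor           */
    unsigned s;

    for (s = 0; s <= 7; s++) {
        uint64_t r = ((a << s) % (d << s)) >> s;  /* scale, divide, shift back */
        if (r != a % d) {
            printf("mismatch at s=%u\n", s);
            return 1;
        }
    }
    printf("remainder survives the pre-scaling for every tested shift\n");
    return 0;
}
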
- */ -static void sp_3072_norm_54(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 53; i++) { - a[i+1] += a[i] >> 57; - a[i] &= 0x1ffffffffffffffL; - } -#else - int i; - for (i = 0; i < 48; i += 8) { - a[i+1] += a[i+0] >> 57; a[i+0] &= 0x1ffffffffffffffL; - a[i+2] += a[i+1] >> 57; a[i+1] &= 0x1ffffffffffffffL; - a[i+3] += a[i+2] >> 57; a[i+2] &= 0x1ffffffffffffffL; - a[i+4] += a[i+3] >> 57; a[i+3] &= 0x1ffffffffffffffL; - a[i+5] += a[i+4] >> 57; a[i+4] &= 0x1ffffffffffffffL; - a[i+6] += a[i+5] >> 57; a[i+5] &= 0x1ffffffffffffffL; - a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; - a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; - } - a[48+1] += a[48] >> 57; a[48] &= 0x1ffffffffffffffL; - a[49+1] += a[49] >> 57; a[49] &= 0x1ffffffffffffffL; - a[50+1] += a[50] >> 57; a[50] &= 0x1ffffffffffffffL; - a[51+1] += a[51] >> 57; a[51] &= 0x1ffffffffffffffL; - a[52+1] += a[52] >> 57; a[52] &= 0x1ffffffffffffffL; -#endif } /* Shift the result in the high 3072 bits down to the bottom. @@ -6289,46 +12430,33 @@ static void sp_3072_norm_54(sp_digit* a) */ static void sp_3072_mont_shift_54(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - int128_t n = a[53] >> 51; - n += ((int128_t)a[54]) << 6; - - for (i = 0; i < 53; i++) { - r[i] = n & 0x1ffffffffffffffL; - n >>= 57; - n += ((int128_t)a[55 + i]) << 6; - } - r[53] = (sp_digit)n; -#else - int i; - int128_t n = a[53] >> 51; - n += ((int128_t)a[54]) << 6; + sp_int128 n = a[53] >> 51; + n += ((sp_int128)a[54]) << 6; for (i = 0; i < 48; i += 8) { r[i + 0] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 55]) << 6; + n >>= 57; n += ((sp_int128)a[i + 55]) << 6; r[i + 1] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 56]) << 6; + n >>= 57; n += ((sp_int128)a[i + 56]) << 6; r[i + 2] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 57]) << 6; + n >>= 57; n += ((sp_int128)a[i + 57]) << 6; r[i + 3] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 58]) << 6; + n >>= 57; n += ((sp_int128)a[i + 58]) << 6; r[i + 4] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 59]) << 6; + n >>= 57; n += ((sp_int128)a[i + 59]) << 6; r[i + 5] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 60]) << 6; + n >>= 57; n += ((sp_int128)a[i + 60]) << 6; r[i + 6] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 61]) << 6; + n >>= 57; n += ((sp_int128)a[i + 61]) << 6; r[i + 7] = n & 0x1ffffffffffffffL; - n >>= 57; n += ((int128_t)a[i + 62]) << 6; + n >>= 57; n += ((sp_int128)a[i + 62]) << 6; } - r[48] = n & 0x1ffffffffffffffL; n >>= 57; n += ((int128_t)a[103]) << 6; - r[49] = n & 0x1ffffffffffffffL; n >>= 57; n += ((int128_t)a[104]) << 6; - r[50] = n & 0x1ffffffffffffffL; n >>= 57; n += ((int128_t)a[105]) << 6; - r[51] = n & 0x1ffffffffffffffL; n >>= 57; n += ((int128_t)a[106]) << 6; - r[52] = n & 0x1ffffffffffffffL; n >>= 57; n += ((int128_t)a[107]) << 6; + r[48] = n & 0x1ffffffffffffffL; n >>= 57; n += ((sp_int128)a[103]) << 6; + r[49] = n & 0x1ffffffffffffffL; n >>= 57; n += ((sp_int128)a[104]) << 6; + r[50] = n & 0x1ffffffffffffffL; n >>= 57; n += ((sp_int128)a[105]) << 6; + r[51] = n & 0x1ffffffffffffffL; n >>= 57; n += ((sp_int128)a[106]) << 6; + r[52] = n & 0x1ffffffffffffffL; n >>= 57; n += ((sp_int128)a[107]) << 6; r[53] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[54], 0, sizeof(*r) * 54U); } @@ -6380,7 +12508,7 @@ static void sp_3072_mont_reduce_54(sp_digit* a, const sp_digit* m, sp_digit mp) a[i] &= 0x1ffffffffffffffL; #endif 
sp_3072_mont_shift_54(a, a); - sp_3072_cond_sub_54(a, a, m, 0 - (((a[53] >> 51) > 0) ? + sp_3072_cond_sub_54(a, a, m, 0 - (((a[53] - m[53]) > 0) ? (sp_digit)1 : (sp_digit)0)); sp_3072_norm_54(a); } @@ -6415,6 +12543,46 @@ static void sp_3072_mont_sqr_54(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_54(r, m, mp); } +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_3072_mul_d_108(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + sp_digit t2; + sp_int128 p[4]; + int i; + + for (i = 0; i < 108; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = (sp_digit)t2; + } + r[108] = (sp_digit)(t & 0x1ffffffffffffffL); +} + /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -6426,13 +12594,6 @@ static void sp_3072_mont_sqr_54(sp_digit* r, const sp_digit* a, static void sp_3072_cond_add_54(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 54; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 48; i += 8) { @@ -6451,7 +12612,29 @@ static void sp_3072_cond_add_54(sp_digit* r, const sp_digit* a, r[51] = a[51] + (b[51] & m); r[52] = a[52] + (b[52] & m); r[53] = a[53] + (b[53] & m); -#endif /* WOLFSSL_SP_SMALL */ +} + +SP_NOINLINE static void sp_3072_rshift_54(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<48; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (57 - n)) & 0x1ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (57 - n)) & 0x1ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (57 - n)) & 0x1ffffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (57 - n)) & 0x1ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (57 - n)) & 0x1ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (57 - n)) & 0x1ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (57 - n)) & 0x1ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (57 - n)) & 0x1ffffffffffffffL); + } + r[48] = (a[48] >> n) | ((a[49] << (57 - n)) & 0x1ffffffffffffffL); + r[49] = (a[49] >> n) | ((a[50] << (57 - n)) & 0x1ffffffffffffffL); + r[50] = (a[50] >> n) | ((a[51] << (57 - n)) & 0x1ffffffffffffffL); + r[51] = (a[51] >> n) | ((a[52] << (57 - n)) & 0x1ffffffffffffffL); + r[52] = (a[52] >> n) | ((a[53] << (57 - n)) & 0x1ffffffffffffffL); + r[53] = a[53] >> n; } #ifdef WOLFSSL_SP_DIV_64 @@ -6546,7 +12729,7 @@ static WC_INLINE sp_digit sp_3072_div_word_54(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. @@ -6554,40 +12737,45 @@ static WC_INLINE sp_digit sp_3072_div_word_54(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
*/ -static int sp_3072_div_54(const sp_digit* a, const sp_digit* d, +static int sp_3072_div_54(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 54 + 1]; + sp_digit t1[4 * 54 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 54 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 54 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 54; + (void)m; - dv = d[53]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 54U); - for (i=53; i>=0; i--) { - t1[54 + i] += t1[54 + i - 1] >> 57; - t1[54 + i - 1] &= 0x1ffffffffffffffL; + if (err == MP_OKAY) { + t2 = t1 + 108 + 1; + sd = t2 + 54 + 1; + + sp_3072_mul_d_54(sd, d, (sp_digit)1 << 6); + sp_3072_mul_d_108(t1, a, (sp_digit)1 << 6); + dv = sd[53]; + t1[54 + 54] += t1[54 + 54 - 1] >> 57; + t1[54 + 54 - 1] &= 0x1ffffffffffffffL; + for (i=54; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_64 d1 = t1[54 + i]; d1 <<= 57; @@ -6597,15 +12785,22 @@ static int sp_3072_div_54(const sp_digit* a, const sp_digit* d, r1 = sp_3072_div_word_54(t1[54 + i], t1[54 + i - 1], dv); #endif - sp_3072_mul_d_54(t2, d, r1); + sp_3072_mul_d_54(t2, sd, r1); (void)sp_3072_sub_54(&t1[i], &t1[i], t2); sp_3072_norm_54(&t1[i]); t1[54 + i] -= t2[54]; t1[54 + i] += t1[54 + i - 1] >> 57; t1[54 + i - 1] &= 0x1ffffffffffffffL; - r1 = (((-t1[54 + i]) << 57) - t1[54 + i - 1]) / dv; - r1++; - sp_3072_mul_d_54(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[54 + i]; + d1 <<= 57; + d1 -= t1[54 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_3072_div_word_54(-t1[54 + i], -t1[54 + i - 1], dv); +#endif + r1 -= t1[54 + i]; + sp_3072_mul_d_54(t2, sd, r1); (void)sp_3072_add_54(&t1[i], &t1[i], t2); t1[54 + i] += t1[54 + i - 1] >> 57; t1[54 + i - 1] &= 0x1ffffffffffffffL; @@ -6614,15 +12809,18 @@ static int sp_3072_div_54(const sp_digit* a, const sp_digit* d, t1[54 - 2] &= 0x1ffffffffffffffL; r1 = t1[54 - 1] / dv; - sp_3072_mul_d_54(t2, d, r1); - (void)sp_3072_sub_54(t1, t1, t2); + sp_3072_mul_d_54(t2, sd, r1); + sp_3072_sub_54(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 108U); for (i=0; i<53; i++) { r[i+1] += r[i] >> 57; r[i] &= 0x1ffffffffffffffL; } - sp_3072_cond_add_54(r, r, d, 0 - ((r[53] < 0) ? + sp_3072_cond_add_54(r, r, sd, 0 - ((r[53] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_3072_norm_54(r); + sp_3072_rshift_54(r, r, 6); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -6834,9 +13032,9 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 108) + 108]; + sp_digit td[(16 * 108) + 108]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm = NULL; sp_digit mp = 1; @@ -6847,7 +13045,7 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 108) + 108), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 108) + 108), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -6855,9 +13053,9 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 108; - rt = td + 3456; + rt = td + 1728; sp_3072_mont_setup(m, &mp); sp_3072_mont_norm_54(norm, m); @@ -6890,24 +13088,8 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_mont_mul_54(t[13], t[ 7], t[ 6], m, mp); sp_3072_mont_sqr_54(t[14], t[ 7], m, mp); sp_3072_mont_mul_54(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_54(t[16], t[ 8], m, mp); - sp_3072_mont_mul_54(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_54(t[18], t[ 9], m, mp); - sp_3072_mont_mul_54(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_54(t[20], t[10], m, mp); - sp_3072_mont_mul_54(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_54(t[22], t[11], m, mp); - sp_3072_mont_mul_54(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_54(t[24], t[12], m, mp); - sp_3072_mont_mul_54(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_54(t[26], t[13], m, mp); - sp_3072_mont_mul_54(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_54(t[28], t[14], m, mp); - sp_3072_mont_mul_54(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_54(t[30], t[15], m, mp); - sp_3072_mont_mul_54(t[31], t[16], t[15], m, mp); - bits = ((bits + 4) / 5) * 5; + bits = ((bits + 3) / 4) * 4; i = ((bits + 56) / 57) - 1; c = bits % 57; if (c == 0) { @@ -6920,28 +13102,39 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, n = 0; i--; } - if (c < 5) { + if (c < 4) { n |= e[i--] << (7 - c); c += 57; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; XMEMCPY(rt, t[y], sizeof(sp_digit) * 108); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 53; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 7; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_3072_mont_sqr_54(rt, rt, m, mp); sp_3072_mont_sqr_54(rt, rt, m, mp); sp_3072_mont_sqr_54(rt, rt, m, mp); sp_3072_mont_sqr_54(rt, rt, m, mp); - sp_3072_mont_sqr_54(rt, rt, m, mp); sp_3072_mont_mul_54(rt, rt, t[y], m, mp); } @@ -7071,7 +13264,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_3072_cond_sub_54(r, r, m, ((mp < 0) ? 
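
/* [Editor's illustration - not part of the patch.]  The hunk above shrinks
 * sp_3072_mod_exp_54()'s precomputation table from 32 entries (5-bit windows)
 * to 16 entries (4-bit windows): the squaring count stays the same, there are
 * slightly more table multiplies, and the table needs half the memory - the
 * "same performance but less memory" trade described in the commit message.
 * A minimal fixed-window exponentiation on 64-bit integers (unsigned
 * __int128 assumed for the products; the real code also works in Montgomery
 * form and pulls its windows out of 57-bit words, which is what the reworked
 * c >= 4 / c == 0 / else branches handle): */
#include <stdint.h>
#include <stdio.h>

static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((unsigned __int128)a * b) % m);
}

/* 4-bit fixed windows: t[i] = base^i mod m for i = 0..15. */
static uint64_t powmod_win4(uint64_t base, uint64_t exp, uint64_t m)
{
    uint64_t t[16];
    uint64_t r = 1 % m;
    int i;
    int j;

    t[0] = 1 % m;
    for (i = 1; i < 16; i++) {
        t[i] = mulmod(t[i - 1], base, m);
    }
    for (i = 60; i >= 0; i -= 4) {              /* most significant window first */
        for (j = 0; j < 4; j++) {
            r = mulmod(r, r, m);                /* four squarings per window     */
        }
        r = mulmod(r, t[(exp >> i) & 0xf], m);  /* one table multiply per window */
    }
    return r;
}

/* Reference: plain square-and-multiply. */
static uint64_t powmod_ref(uint64_t base, uint64_t exp, uint64_t m)
{
    uint64_t r = 1 % m;
    uint64_t b = base % m;

    while (exp != 0) {
        if (exp & 1) {
            r = mulmod(r, b, m);
        }
        b = mulmod(b, b, m);
        exp >>= 1;
    }
    return r;
}

int main(void)
{
    uint64_t m = 0xffffffffffffffc5ULL;         /* any odd modulus */
    uint64_t b = 0x0123456789abcdefULL;
    uint64_t e = 0xfedcba9876543210ULL;

    printf("4-bit windows %s square-and-multiply\n",
           (powmod_win4(b, e, m) == powmod_ref(b, e, m)) ? "match" : "differ from");
    return 0;
}
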
(sp_digit)1 : (sp_digit)0)- 1); - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7184,7 +13377,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7278,7 +13471,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7351,7 +13544,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7374,10 +13567,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[27 * 11]; + sp_digit a[27 * 8]; #endif sp_digit* p = NULL; - sp_digit* q = NULL; sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -7406,31 +13598,31 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 27 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 27 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 54 * 2; - q = p + 27; - qi = dq = dp = q + 27; + p = a + 54; + qi = dq = dp = p + 27; tmpa = qi + 27; tmpb = tmpa + 54; - r = a + 54; + r = a; sp_3072_from_bin(a, 54, in, inLen); sp_3072_from_mp(p, 27, pm); - sp_3072_from_mp(q, 27, qm); sp_3072_from_mp(dp, 27, dpm); err = sp_3072_mod_exp_27(tmpa, a, dp, 1536, p, 1); } if (err == MP_OKAY) { + sp_3072_from_mp(p, 27, qm); sp_3072_from_mp(dq, 27, dqm); - err = sp_3072_mod_exp_27(tmpb, a, dq, 1536, q, 1); + err = sp_3072_mod_exp_27(tmpb, a, dq, 1536, p, 1); } if (err == MP_OKAY) { + sp_3072_from_mp(p, 27, pm); (void)sp_3072_sub_27(tmpa, tmpa, tmpb); sp_3072_norm_27(tmpa); sp_3072_cond_add_27(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[26] >> 63)); @@ -7442,11 +13634,12 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_mul_27(tmpa, q, tmpa); + sp_3072_from_mp(p, 27, qm); + sp_3072_mul_27(tmpa, p, tmpa); (void)sp_3072_add_54(r, tmpb, tmpa); sp_3072_norm_54(r); - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7454,7 +13647,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 27 * 11); + ForceZero(a, sizeof(sp_digit) * 27 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -7497,7 +13690,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 27 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 27 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -7541,7 +13734,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, (void)sp_3072_add_54(r, tmpb, tmpa); sp_3072_norm_54(r); - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; } @@ -7778,14 +13971,6 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, SP_NOINLINE static void sp_3072_lshift_54(sp_digit* r, const sp_digit* a, byte n) { -#ifdef WOLFSSL_SP_SMALL - 
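
/* [Editor's illustration - not part of the patch.]  The sp_RsaPrivate_3072()
 * changes above load p, dp, q, dq and qInv into the same 27-word temporary
 * one after another, cutting the scratch buffer from 27*11 to 27*8 digits,
 * but the CRT recombination itself is unchanged:
 *     m1 = c^dp mod p,   m2 = c^dq mod q,
 *     h  = qInv*(m1 - m2) mod p,   m = m2 + h*q.
 * The same steps with a toy key (all numbers below are the editor's example,
 * small enough that plain 64-bit arithmetic cannot overflow): */
#include <stdint.h>
#include <stdio.h>

static uint64_t powmod(uint64_t b, uint64_t e, uint64_t m)
{
    uint64_t r = 1;

    b %= m;
    while (e != 0) {
        if (e & 1) {
            r = (r * b) % m;
        }
        b = (b * b) % m;
        e >>= 1;
    }
    return r;
}

int main(void)
{
    uint64_t p = 61, q = 53, n = p * q;           /* n = 3233                  */
    uint64_t e = 17, d = 413;                     /* d = 17^-1 mod lcm(60, 52) */
    uint64_t dp = d % (p - 1);                    /* 53                        */
    uint64_t dq = d % (q - 1);                    /* 49                        */
    uint64_t qinv = 38;                           /* 38 * 53 == 1 (mod 61)     */
    uint64_t msg = 65;
    uint64_t c = powmod(msg, e, n);               /* "encrypt"                 */

    /* CRT private operation, mirroring the tmpa/tmpb flow above. */
    uint64_t m1 = powmod(c, dp, p);
    uint64_t m2 = powmod(c, dq, q);
    uint64_t diff = (m1 + p - (m2 % p)) % p;      /* keep the difference >= 0  */
    uint64_t h = (qinv * diff) % p;
    uint64_t m = m2 + h * q;

    printf("CRT gives %llu, straight c^d mod n gives %llu\n",
           (unsigned long long)m, (unsigned long long)powmod(c, d, n));
    return 0;
}
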
int i; - - r[54] = a[53] >> (57 - n); - for (i=53; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (57 - n))) & 0x1ffffffffffffffL; - } -#else sp_int_digit s; sp_int_digit t; @@ -7897,7 +14082,6 @@ SP_NOINLINE static void sp_3072_lshift_54(sp_digit* r, const sp_digit* a, r[2] = ((s << n) | (t >> (57U - n))) & 0x1ffffffffffffffUL; s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); r[1] = ((s << n) | (t >> (57U - n))) & 0x1ffffffffffffffUL; -#endif r[0] = (a[0] << n) & 0x1ffffffffffffffL; } @@ -7963,13 +14147,25 @@ static int sp_3072_mod_exp_2_54(sp_digit* r, const sp_digit* e, int bits, const c -= 5; sp_3072_lshift_54(r, norm, (byte)y); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (7 - c); - c += 57; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 7; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 52; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 7; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 57 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_3072_mont_sqr_54(r, r, m, mp); sp_3072_mont_sqr_54(r, r, m, mp); @@ -8019,80 +14215,6 @@ static int sp_3072_mod_exp_2_54(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[54 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 3072) { - err = MP_READ_E; - } - else if (expLen > 384) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 3072) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 54 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 54 * 2; - m = e + 54; - r = b; - - sp_3072_from_mp(b, 54, base); - sp_3072_from_bin(e, 54, exp, expLen); - sp_3072_from_mp(m, 54, mod); - - #ifdef HAVE_FFDHE_3072 - if (base->used == 1 && base->dp[0] == 2 && - (m[53] >> 19) == 0xffffffffL) { - err = sp_3072_mod_exp_2_54(r, e, expLen * 8, m); - } - else - #endif - err = sp_3072_mod_exp_54(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_3072_to_bin(r, out); - *outLen = 384; - for (i=0; i<384 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 54U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else @@ -8149,7 +14271,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_54(r, out); *outLen = 384; for (i=0; i<384U && out[i] == 0U; i++) { /* Search for first non-zero. 
*/ @@ -8171,7 +14293,6 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ @@ -8319,9 +14440,3125 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ +#endif /* WOLFSSL_SP_SMALL */ #endif /* !WOLFSSL_SP_NO_3072 */ #ifdef WOLFSSL_SP_4096 +#ifdef WOLFSSL_SP_SMALL +/* Read big endian unsigned byte array into r. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a Byte array. + * n Number of bytes in array to read. + */ +static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) +{ + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = n-1; i >= 0; i--) { + r[j] |= (((sp_digit)a[i]) << s); + if (s >= 51U) { + r[j] &= 0x7ffffffffffffffL; + s = 59U - s; + if (j + 1 >= size) { + break; + } + r[++j] = (sp_digit)a[i] >> s; + s = 8U - s; + } + else { + s += 8U; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +} + +/* Convert an mp_int to an array of sp_digit. + * + * r A single precision integer. + * size Maximum number of bytes to convert + * a A multi-precision integer. + */ +static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) +{ +#if DIGIT_BIT == 59 + int j; + + XMEMCPY(r, a->dp, sizeof(sp_digit) * a->used); + + for (j = a->used; j < size; j++) { + r[j] = 0; + } +#elif DIGIT_BIT > 59 + int i; + int j = 0; + word32 s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i] << s); + r[j] &= 0x7ffffffffffffffL; + s = 59U - s; + if (j + 1 >= size) { + break; + } + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + while ((s + 59U) <= (word32)DIGIT_BIT) { + s += 59U; + r[j] &= 0x7ffffffffffffffL; + if (j + 1 >= size) { + break; + } + if (s < (word32)DIGIT_BIT) { + /* lint allow cast of mismatch word32 and mp_digit */ + r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/ + } + else { + r[++j] = (sp_digit)0; + } + } + s = (word32)DIGIT_BIT - s; + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#else + int i; + int j = 0; + int s = 0; + + r[0] = 0; + for (i = 0; i < a->used && j < size; i++) { + r[j] |= ((sp_digit)a->dp[i]) << s; + if (s + DIGIT_BIT >= 59) { + r[j] &= 0x7ffffffffffffffL; + if (j + 1 >= size) { + break; + } + s = 59 - s; + if (s == DIGIT_BIT) { + r[++j] = 0; + s = 0; + } + else { + r[++j] = a->dp[i] >> s; + s = DIGIT_BIT - s; + } + } + else { + s += DIGIT_BIT; + } + } + + for (j++; j < size; j++) { + r[j] = 0; + } +#endif +} + +/* Write r as big endian to byte array. + * Fixed length number of bytes written: 512 + * + * r A single precision integer. + * a Byte array. + */ +static void sp_4096_to_bin_70(sp_digit* r, byte* a) +{ + int i; + int j; + int s = 0; + int b; + + for (i=0; i<69; i++) { + r[i+1] += r[i] >> 59; + r[i] &= 0x7ffffffffffffffL; + } + j = 4096 / 8 - 1; + a[j] = 0; + for (i=0; i<70 && j>=0; i++) { + b = 0; + /* lint allow cast of mismatch sp_digit and int */ + a[j--] |= (byte)(r[i] << s); /*lint !e9033*/ + b += 8 - s; + if (j < 0) { + break; + } + while (b < 59) { + a[j--] = (byte)(r[i] >> b); + b += 8; + if (j < 0) { + break; + } + } + s = 8 - (b - 59); + if (j >= 0) { + a[j] = 0; + } + if (s != 0) { + j++; + } + } +} + +#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Normalize the values in each word to 59 bits. + * + * a Array of sp_digit to normalize. 
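
/* [Editor's illustration - not part of the patch.]  sp_4096_from_bin() and
 * sp_4096_to_bin_70() above repack a big-endian byte string into 59-bit limbs
 * and back.  The wolfSSL versions are carefully tuned; this is only the
 * underlying idea - a generic repack plus a round-trip check (the limb width
 * macro, buffer sizes and helper names here are the editor's): */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define W      59                     /* bits per limb, as in the 4096-bit code */
#define NBYTES 16
#define NLIMBS ((NBYTES * 8 + W - 1) / W)

static void from_bin(uint64_t* r, const uint8_t* a, int n)
{
    int i;
    int bit = 0;                      /* bits already filled in r[limb] */
    int limb = 0;

    memset(r, 0, NLIMBS * sizeof(*r));
    for (i = n - 1; i >= 0; i--) {    /* last byte of big-endian = least significant */
        r[limb] |= (uint64_t)a[i] << bit;
        r[limb] &= ((uint64_t)1 << W) - 1;
        if (bit + 8 >= W) {           /* byte straddles a limb boundary */
            r[limb + 1] = (uint64_t)a[i] >> (W - bit);
            limb++;
            bit = bit + 8 - W;
        }
        else {
            bit += 8;
        }
    }
}

static void to_bin(uint8_t* a, const uint64_t* r, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        int pos = (n - 1 - i) * 8;    /* bit offset of this byte's LSB */
        int limb = pos / W;
        int bit = pos % W;
        uint64_t v = r[limb] >> bit;
        if (bit + 8 > W && limb + 1 < NLIMBS) {
            v |= r[limb + 1] << (W - bit);   /* pull the spill-over bits back in */
        }
        a[i] = (uint8_t)v;
    }
}

int main(void)
{
    uint8_t in[NBYTES] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
                           0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 };
    uint8_t out[NBYTES];
    uint64_t limbs[NLIMBS];

    from_bin(limbs, in, NBYTES);
    to_bin(out, limbs, NBYTES);
    printf("round trip %s\n", memcmp(in, out, NBYTES) == 0 ? "ok" : "failed");
    return 0;
}
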
+ */ +static void sp_4096_norm_35(sp_digit* a) +{ + int i; + for (i = 0; i < 34; i++) { + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; + } +} + +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +/* Normalize the values in each word to 59 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_70(sp_digit* a) +{ + int i; + for (i = 0; i < 69; i++) { + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; + } +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_70(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[69]) * b[69]; + r[139] = (sp_digit)(c >> 59); + c &= 0x7ffffffffffffffL; + for (k = 137; k >= 0; k--) { + if (k >= 70) { + i = k - 69; + imax = 69; + } + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 59; + r[k + 2] += (sp_digit)(c >> 59); + r[k + 1] = (sp_digit)(c & 0x7ffffffffffffffL); + c = lo & 0x7ffffffffffffffL; + } + r[0] = (sp_digit)c; +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_70(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[69]) * a[69]; + r[139] = (sp_digit)(c >> 59); + c = (c & 0x7ffffffffffffffL) << 59; + for (k = 137; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 69) { + imax = k; + } + else { + imax = 69; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 118); + r[k + 1] = (sp_digit)((c >> 59) & 0x7ffffffffffffffL); + c = (c & 0x7ffffffffffffffL) << 59; + } + r[0] = (sp_digit)(c >> 59); +} + +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x *= 2 - b * x; /* here x*a==1 mod 2**64 */ + x &= 0x7ffffffffffffffL; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 59) - x; +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_d_70(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 70; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x7ffffffffffffffL); + t >>= 59; + } + r[70] = (sp_digit)t; +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) +#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
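
/* [Editor's illustration - not part of the patch.]  sp_4096_mul_70() above is
 * a "product scanning" multiply: for each result column k it sums a[i]*b[k-i]
 * into a wide accumulator, stores the low limb and carries the rest into the
 * next column.  The wolfSSL loop walks the columns from the top down and
 * splits the accumulator differently, but the column-at-a-time shape is the
 * same.  The idea on four 16-bit limbs, checked against a direct product
 * (limb width and sizes here are only for the illustration): */
#include <stdint.h>
#include <stdio.h>

static void mul_comba4(const uint16_t a[4], const uint16_t b[4], uint16_t r[8])
{
    uint64_t acc = 0;                     /* column accumulator + carry */
    int k;
    int i;

    for (k = 0; k <= 6; k++) {
        int lo = (k < 4) ? 0 : (k - 3);   /* clamp i so both indexes stay in range */
        int hi = (k < 4) ? k : 3;
        for (i = lo; i <= hi; i++) {
            acc += (uint64_t)a[i] * b[k - i];
        }
        r[k] = (uint16_t)acc;             /* low 16 bits of this column */
        acc >>= 16;                       /* carry rides into the next column */
    }
    r[7] = (uint16_t)acc;
}

int main(void)
{
    uint16_t a[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };  /* 0xdef09abc56781234 */
    uint16_t b[4] = { 0x4321, 0x8765, 0xcba9, 0x0fed };  /* 0x0fedcba987654321 */
    uint16_t r[8];
    unsigned __int128 got = 0;
    unsigned __int128 chk;
    int i;

    mul_comba4(a, b, r);
    for (i = 7; i >= 0; i--) {
        got = (got << 16) | r[i];
    }
    chk = (unsigned __int128)0xdef09abc56781234ULL * 0x0fedcba987654321ULL;
    printf("column scanning %s the direct product\n",
           (got == chk) ? "matches" : "differs from");
    return 0;
}
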
+ */ +SP_NOINLINE static int sp_4096_sub_35(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 35; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 4096 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_4096_mont_norm_35(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<34; i++) { + r[i] = 0x7ffffffffffffffL; + } + r[34] = 0x3ffffffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_4096_sub_35(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_4096_cmp_35(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=34; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_4096_cond_sub_35(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 35; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_add_35(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 32; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x7ffffffffffffffL; + t[1] += t[0] >> 59; + r[i+1] = t[1] & 0x7ffffffffffffffL; + t[2] += t[1] >> 59; + r[i+2] = t[2] & 0x7ffffffffffffffL; + t[3] += t[2] >> 59; + r[i+3] = t[3] & 0x7ffffffffffffffL; + t[0] = t[3] >> 59; + } + t[0] += (tb * a[32]) + r[32]; + t[1] = (tb * a[33]) + r[33]; + t[2] = (tb * a[34]) + r[34]; + r[32] = t[0] & 0x7ffffffffffffffL; + t[1] += t[0] >> 59; + r[33] = t[1] & 0x7ffffffffffffffL; + t[2] += t[1] >> 59; + r[34] = t[2] & 0x7ffffffffffffffL; + r[35] += (sp_digit)(t[2] >> 59); +} + +/* Shift the result in the high 2048 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. + */ +static void sp_4096_mont_shift_35(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[34] >> 42; + n += ((sp_int128)a[35]) << 17; + + for (i = 0; i < 34; i++) { + r[i] = n & 0x7ffffffffffffffL; + n >>= 59; + n += ((sp_int128)a[36 + i]) << 17; + } + r[34] = (sp_digit)n; + XMEMSET(&r[35], 0, sizeof(*r) * 35U); +} + +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static void sp_4096_mont_reduce_35(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_4096_norm_35(a + 35); + + for (i=0; i<34; i++) { + mu = (a[i] * mp) & 0x7ffffffffffffffL; + sp_4096_mul_add_35(a+i, m, mu); + a[i+1] += a[i] >> 59; + } + mu = (a[i] * mp) & 0x3ffffffffffL; + sp_4096_mul_add_35(a+i, m, mu); + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; + sp_4096_mont_shift_35(a, a); + sp_4096_cond_sub_35(a, a, m, 0 - (((a[34] - m[34]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_4096_norm_35(a); +} + +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_4096_mul_35(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 lo; + + c = ((sp_uint128)a[34]) * b[34]; + r[69] = (sp_digit)(c >> 59); + c &= 0x7ffffffffffffffL; + for (k = 67; k >= 0; k--) { + if (k >= 35) { + i = k - 34; + imax = 34; + } + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 59; + r[k + 2] += (sp_digit)(c >> 59); + r[k + 1] = (sp_digit)(c & 0x7ffffffffffffffL); + c = lo & 0x7ffffffffffffffL; + } + r[0] = (sp_digit)c; +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_mul_35(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_4096_mul_35(r, a, b); + sp_4096_mont_reduce_35(r, m, mp); +} + +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_4096_sqr_35(sp_digit* r, const sp_digit* a) +{ + int i; + int imax; + int k; + sp_uint128 c; + sp_uint128 t; + + c = ((sp_uint128)a[34]) * a[34]; + r[69] = (sp_digit)(c >> 59); + c = (c & 0x7ffffffffffffffL) << 59; + for (k = 67; k >= 0; k--) { + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; + } + if (k < 34) { + imax = k; + } + else { + imax = 34; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; + + r[k + 2] += (sp_digit) (c >> 118); + r[k + 1] = (sp_digit)((c >> 59) & 0x7ffffffffffffffL); + c = (c & 0x7ffffffffffffffL) << 59; + } + r[0] = (sp_digit)(c >> 59); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_sqr_35(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_4096_sqr_35(r, a); + sp_4096_mont_reduce_35(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_d_35(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 35; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x7ffffffffffffffL); + t >>= 59; + } + r[35] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. 
+ * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_4096_cond_add_35(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 35; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_add_35(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 35; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_4096_rshift_35(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<34; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (59 - n))) & 0x7ffffffffffffffL; + } + r[34] = a[34] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_4096_div_word_35(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 59 bits from d1 and top 4 bits from d0. */ + d = (d1 << 4) + (d0 >> 55); + r = d / dv; + d -= r * dv; + /* Up to 5 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 51) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 47) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 43) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 39) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 35) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 31) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 27) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 33 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 23) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 19) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 41 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 15) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 45 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 11) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 7) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 53 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 3) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 57 bits in r */ + /* Remaining 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += d0 & ((1 << 3) - 1); + t = d / dv; + r += t; + + /* All 59 bits from d1 and top 4 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. 
+ * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_4096_div_35(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 35 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 35 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 70 + 1; + sd = t2 + 35 + 1; + + sp_4096_mul_d_35(sd, d, (sp_digit)1 << 17); + sp_4096_mul_d_70(t1, a, (sp_digit)1 << 17); + dv = sd[34]; + t1[35 + 35] += t1[35 + 35 - 1] >> 59; + t1[35 + 35 - 1] &= 0x7ffffffffffffffL; + for (i=35; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[35 + i]; + d1 <<= 59; + d1 += t1[35 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_35(t1[35 + i], t1[35 + i - 1], dv); +#endif + + sp_4096_mul_d_35(t2, sd, r1); + (void)sp_4096_sub_35(&t1[i], &t1[i], t2); + sp_4096_norm_35(&t1[i]); + t1[35 + i] -= t2[35]; + t1[35 + i] += t1[35 + i - 1] >> 59; + t1[35 + i - 1] &= 0x7ffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[35 + i]; + d1 <<= 59; + d1 -= t1[35 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_35(-t1[35 + i], -t1[35 + i - 1], dv); +#endif + r1 -= t1[35 + i]; + sp_4096_mul_d_35(t2, sd, r1); + (void)sp_4096_add_35(&t1[i], &t1[i], t2); + t1[35 + i] += t1[35 + i - 1] >> 59; + t1[35 + i - 1] &= 0x7ffffffffffffffL; + } + t1[35 - 1] += t1[35 - 2] >> 59; + t1[35 - 2] &= 0x7ffffffffffffffL; + r1 = t1[35 - 1] / dv; + + sp_4096_mul_d_35(t2, sd, r1); + sp_4096_sub_35(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 70U); + for (i=0; i<34; i++) { + r[i+1] += r[i] >> 59; + r[i] &= 0x7ffffffffffffffL; + } + sp_4096_cond_add_35(r, r, sd, 0 - ((r[34] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_4096_norm_35(r); + sp_4096_rshift_35(r, r, 17); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_4096_mod_35(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_35(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
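
/* [Editor's illustration - not part of the patch.]  The WOLFSSL_SP_SMALL and
 * cache-resistant bodies of sp_4096_mod_exp_35() that follow process the
 * exponent bit by bit, always performing exactly one Montgomery multiply and
 * one Montgomery square, and select the destination with the addr_mask[]
 * trick rather than a branch.  Stripped of the Montgomery arithmetic and the
 * masked addressing, that loop is a Montgomery-ladder exponentiation,
 * sketched below on 64-bit integers (the explicit if is for readability only;
 * both branches do the same amount of work). */
#include <stdint.h>
#include <stdio.h>

static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
{
    return (uint64_t)(((unsigned __int128)a * b) % m);
}

/* Invariant: t0 = base^k, t1 = base^(k+1) for the bits processed so far. */
static uint64_t powmod_ladder(uint64_t base, uint64_t exp, uint64_t m)
{
    uint64_t t0 = 1 % m;
    uint64_t t1 = base % m;
    int i;

    for (i = 63; i >= 0; i--) {
        int y = (int)((exp >> i) & 1);
        if (y == 0) {
            t1 = mulmod(t0, t1, m);   /* t[y^1] = t[0]*t[1] in the real code */
            t0 = mulmod(t0, t0, m);   /* square the register selected by y   */
        }
        else {
            t0 = mulmod(t0, t1, m);
            t1 = mulmod(t1, t1, m);
        }
    }
    return t0;
}

static uint64_t powmod_ref(uint64_t base, uint64_t exp, uint64_t m)
{
    uint64_t r = 1 % m;
    uint64_t b = base % m;

    while (exp != 0) {
        if (exp & 1) {
            r = mulmod(r, b, m);
        }
        b = mulmod(b, b, m);
        exp >>= 1;
    }
    return r;
}

int main(void)
{
    uint64_t m = 0xffffffffffffffc5ULL;
    uint64_t b = 0x0123456789abcdefULL;
    uint64_t e = 0xfedcba9876543210ULL;

    printf("ladder %s square-and-multiply\n",
           (powmod_ladder(b, e, m) == powmod_ref(b, e, m)) ? "matches" : "differs from");
    return 0;
}
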
+ */ +static int sp_4096_mod_exp_35(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 70]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 35 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 35 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 35U * 2U); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_35(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_35(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 35U); + } + } + if (err == MP_OKAY) { + sp_4096_mul_35(t[1], t[1], norm); + err = sp_4096_mod_35(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 59; + c = bits % 59; + n = e[i--] << (59 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 59; + } + + y = (int)((n >> 58) & 1); + n <<= 1; + + sp_4096_mont_mul_35(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 35 * 2); + sp_4096_mont_sqr_35(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 35 * 2); + } + + sp_4096_mont_reduce_35(t[0], m, mp); + n = sp_4096_cmp_35(t[0], m); + sp_4096_cond_sub_35(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 35 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 70]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 35 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 35 * 2); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_35(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_35(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_35(t[1], t[1], norm); + err = sp_4096_mod_35(t[1], t[1], m); + } + } + else { + sp_4096_mul_35(t[1], a, norm); + err = sp_4096_mod_35(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 59; + c = bits % 59; + n = e[i--] << (59 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 59; + } + + y = (int)((n >> 58) & 1); + n <<= 1; + + sp_4096_mont_mul_35(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 35 * 2); + sp_4096_mont_sqr_35(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 35 * 2); + } + + sp_4096_mont_reduce_35(t[0], m, mp); + n = sp_4096_cmp_35(t[0], m); + sp_4096_cond_sub_35(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 35 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(32 * 70) + 70]; +#endif + sp_digit* t[32]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 70) + 70), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<32; i++) + t[i] = td + i * 70; + rt = td + 2240; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_35(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_35(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_35(t[1], t[1], norm); + err = sp_4096_mod_35(t[1], t[1], m); + } + } + else { + sp_4096_mul_35(t[1], a, norm); + err = sp_4096_mod_35(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_35(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_35(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_35(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_35(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_35(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_35(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_35(t[ 8], t[ 4], m, mp); + sp_4096_mont_mul_35(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_35(t[10], t[ 5], m, mp); + sp_4096_mont_mul_35(t[11], t[ 6], t[ 5], m, mp); + sp_4096_mont_sqr_35(t[12], t[ 6], m, mp); + sp_4096_mont_mul_35(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_35(t[14], t[ 7], m, mp); + sp_4096_mont_mul_35(t[15], t[ 8], t[ 7], m, mp); + sp_4096_mont_sqr_35(t[16], t[ 8], m, mp); + sp_4096_mont_mul_35(t[17], t[ 9], t[ 8], m, mp); + sp_4096_mont_sqr_35(t[18], t[ 9], m, mp); + sp_4096_mont_mul_35(t[19], t[10], t[ 9], m, mp); + sp_4096_mont_sqr_35(t[20], t[10], m, mp); + sp_4096_mont_mul_35(t[21], t[11], t[10], m, mp); + sp_4096_mont_sqr_35(t[22], t[11], m, mp); + sp_4096_mont_mul_35(t[23], t[12], t[11], m, mp); + sp_4096_mont_sqr_35(t[24], t[12], m, mp); + sp_4096_mont_mul_35(t[25], t[13], t[12], m, mp); + sp_4096_mont_sqr_35(t[26], t[13], m, mp); + sp_4096_mont_mul_35(t[27], t[14], t[13], m, mp); + sp_4096_mont_sqr_35(t[28], t[14], m, mp); + sp_4096_mont_mul_35(t[29], t[15], t[14], m, mp); + sp_4096_mont_sqr_35(t[30], t[15], m, mp); + sp_4096_mont_mul_35(t[31], t[16], t[15], m, mp); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 58) / 59) - 1; + c = bits % 59; + if (c == 0) { + c = 59; + } + if (i < 35) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (5 - c); + c += 59; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 70); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 5; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 54; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 5; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 59 - c; + } + + sp_4096_mont_sqr_35(rt, rt, m, mp); + sp_4096_mont_sqr_35(rt, rt, m, mp); + sp_4096_mont_sqr_35(rt, rt, m, mp); + sp_4096_mont_sqr_35(rt, rt, m, mp); + sp_4096_mont_sqr_35(rt, rt, m, mp); + + sp_4096_mont_mul_35(rt, rt, t[y], m, mp); + } + + 
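The loop above consumes the exponent five bits per iteration: five Montgomery squarings followed by one multiplication with the table entry t[y], where t[k] holds a^k in Montgomery form. The same fixed-window scan on single machine words, shown with a 4-bit window to keep the table small (powmod_win4 is an illustrative name; the modulus is kept below 2^32 so plain 64-bit products cannot overflow):

#include <stdint.h>

/* Fixed 4-bit-window exponentiation: precompute base^0..base^15, then for
 * each window do four squarings and one multiply by the selected entry.
 * Structurally the same as the 5-bit window walk in sp_4096_mod_exp_35(),
 * without the Montgomery representation.  Requires mod < 2^32. */
static uint64_t powmod_win4(uint64_t base, uint64_t exp, uint64_t mod)
{
    uint64_t t[16];
    uint64_t r = 1 % mod;
    int i;

    t[0] = 1 % mod;
    for (i = 1; i < 16; i++) {
        t[i] = (t[i - 1] * (base % mod)) % mod;
    }
    for (i = 60; i >= 0; i -= 4) {
        unsigned y = (unsigned)((exp >> i) & 0xf);

        r = (r * r) % mod;
        r = (r * r) % mod;
        r = (r * r) % mod;
        r = (r * r) % mod;
        r = (r * t[y]) % mod;
    }
    return r;
}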
sp_4096_mont_reduce_35(rt, m, mp); + n = sp_4096_cmp_35(rt, m); + sp_4096_cond_sub_35(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 70); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */ + +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static int sp_4096_sub_70(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 70; i++) { + r[i] = a[i] - b[i]; + } + + return 0; +} + +/* r = 2^n mod m where n is the number of bits to reduce by. + * Given m must be 4096 bits, just need to subtract. + * + * r A single precision number. + * m A single precision number. + */ +static void sp_4096_mont_norm_70(sp_digit* r, const sp_digit* m) +{ + /* Set r = 2^n - 1. */ + int i; + + for (i=0; i<69; i++) { + r[i] = 0x7ffffffffffffffL; + } + r[69] = 0x1ffffffL; + + /* r = (2^n - 1) mod n */ + (void)sp_4096_sub_70(r, r, m); + + /* Add one so r = 2^n mod m */ + r[0] += 1; +} + +/* Compare a with b in constant time. + * + * a A single precision integer. + * b A single precision integer. + * return -ve, 0 or +ve if a is less than, equal to or greater than b + * respectively. + */ +static sp_digit sp_4096_cmp_70(const sp_digit* a, const sp_digit* b) +{ + sp_digit r = 0; + int i; + + for (i=69; i>=0; i--) { + r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); + } + + return r; +} + +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static void sp_4096_cond_sub_70(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 70; i++) { + r[i] = a[i] - (b[i] & m); + } +} + +/* Mul a by scalar b and add into r. (r += a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_add_70(sp_digit* r, const sp_digit* a, + const sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t[4]; + int i; + + t[0] = 0; + for (i = 0; i < 68; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x7ffffffffffffffL; + t[1] += t[0] >> 59; + r[i+1] = t[1] & 0x7ffffffffffffffL; + t[2] += t[1] >> 59; + r[i+2] = t[2] & 0x7ffffffffffffffL; + t[3] += t[2] >> 59; + r[i+3] = t[3] & 0x7ffffffffffffffL; + t[0] = t[3] >> 59; + } + t[0] += (tb * a[68]) + r[68]; + t[1] = (tb * a[69]) + r[69]; + r[68] = t[0] & 0x7ffffffffffffffL; + t[1] += t[0] >> 59; + r[69] = t[1] & 0x7ffffffffffffffL; + r[70] += (sp_digit)(t[1] >> 59); +} + +/* Shift the result in the high 4096 bits down to the bottom. + * + * r A single precision number. + * a A single precision number. 
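sp_4096_cmp_70() and sp_4096_cond_sub_70() above avoid data-dependent branches: the compare keeps only the most significant differing digit's difference under a mask, and the conditional subtract ANDs every digit of b with an all-ones or all-zero mask before subtracting. A stand-alone sketch of the masked subtract with 32-bit limbs and an explicit borrow chain (ct_cond_sub and LIMBS are illustrative; the generated code instead uses a redundant 59-bit representation and normalizes later):

#include <stdint.h>

#define LIMBS 8   /* illustrative limb count */

/* Constant-time conditional subtract: r = a - (b & mask), limb by limb,
 * where mask is all ones to subtract and zero to copy a unchanged.  The
 * amount of work is identical in both cases, as in sp_4096_cond_sub_70(). */
static void ct_cond_sub(uint32_t* r, const uint32_t* a, const uint32_t* b,
                        uint32_t mask)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < LIMBS; i++) {
        uint64_t d = (uint64_t)a[i] - (b[i] & mask) - borrow;
        r[i] = (uint32_t)d;
        borrow = (d >> 32) & 1;   /* 1 exactly when the subtraction wrapped */
    }
}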
+ */ +static void sp_4096_mont_shift_70(sp_digit* r, const sp_digit* a) +{ + int i; + sp_int128 n = a[69] >> 25; + n += ((sp_int128)a[70]) << 34; + + for (i = 0; i < 69; i++) { + r[i] = n & 0x7ffffffffffffffL; + n >>= 59; + n += ((sp_int128)a[71 + i]) << 34; + } + r[69] = (sp_digit)n; + XMEMSET(&r[70], 0, sizeof(*r) * 70U); +} + +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_4096_mont_reduce_70(sp_digit* a, const sp_digit* m, sp_digit mp) +{ + int i; + sp_digit mu; + + sp_4096_norm_70(a + 70); + +#ifdef WOLFSSL_SP_DH + if (mp != 1) { + for (i=0; i<69; i++) { + mu = (a[i] * mp) & 0x7ffffffffffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + } + mu = (a[i] * mp) & 0x1ffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; + } + else { + for (i=0; i<69; i++) { + mu = a[i] & 0x7ffffffffffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + } + mu = a[i] & 0x1ffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; + } +#else + for (i=0; i<69; i++) { + mu = (a[i] * mp) & 0x7ffffffffffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + } + mu = (a[i] * mp) & 0x1ffffffL; + sp_4096_mul_add_70(a+i, m, mu); + a[i+1] += a[i] >> 59; + a[i] &= 0x7ffffffffffffffL; +#endif + sp_4096_mont_shift_70(a, a); + sp_4096_cond_sub_70(a, a, m, 0 - (((a[69] - m[69]) > 0) ? + (sp_digit)1 : (sp_digit)0)); + sp_4096_norm_70(a); +} + +/* Multiply two Montogmery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montogmery form. + * b Second number to multiply in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_mul_70(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) +{ + sp_4096_mul_70(r, a, b); + sp_4096_mont_reduce_70(r, m, mp); +} + +/* Square the Montgomery form number. (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montogmery form. + * m Modulus (prime). + * mp Montogmery mulitplier. + */ +static void sp_4096_mont_sqr_70(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) +{ + sp_4096_sqr_70(r, a); + sp_4096_mont_reduce_70(r, m, mp); +} + +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_4096_mul_d_140(sp_digit* r, const sp_digit* a, + sp_digit b) +{ + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 140; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x7ffffffffffffffL); + t >>= 59; + } + r[140] = (sp_digit)t; +} + +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. + */ +static void sp_4096_cond_add_70(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit m) +{ + int i; + + for (i = 0; i < 35; i++) { + r[i] = a[i] + (b[i] & m); + } +} + +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
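sp_4096_mont_reduce_70() above is word-serial Montgomery reduction: for each limb it forms mu = a[i] * mp mod 2^59 (mp being -m^-1 mod 2^59), adds mu*m so that limb cancels, then shifts the value down past the low 4096 bits and conditionally subtracts m once. The same step collapsed to a single machine word, i.e. classic REDC (mont_redc32 is an illustrative helper, not the library API; m is assumed odd and below 2^31 so the 64-bit sum cannot overflow):

#include <stdint.h>

/* One-word Montgomery reduction with R = 2^32.
 * Inputs: t < m * R, m odd, m < 2^31, mp = -m^(-1) mod 2^32.
 * Returns t * R^(-1) mod m.  sp_4096_mont_reduce_70() applies this step
 * once per 59-bit limb and uses a mask instead of the branch below. */
static uint32_t mont_redc32(uint64_t t, uint32_t m, uint32_t mp)
{
    uint32_t mu = (uint32_t)t * mp;              /* cancels the low word */
    uint64_t u = (t + (uint64_t)mu * m) >> 32;   /* exact division by R  */

    if (u >= m) {
        u -= m;                                  /* at most one subtract */
    }
    return (uint32_t)u;
}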
+ */ +SP_NOINLINE static int sp_4096_add_70(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + int i; + + for (i = 0; i < 70; i++) { + r[i] = a[i] + b[i]; + } + + return 0; +} + +SP_NOINLINE static void sp_4096_rshift_70(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + for (i=0; i<69; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (59 - n))) & 0x7ffffffffffffffL; + } + r[69] = a[69] >> n; +} + +#ifdef WOLFSSL_SP_DIV_64 +static WC_INLINE sp_digit sp_4096_div_word_70(sp_digit d1, sp_digit d0, + sp_digit dv) +{ + sp_digit d; + sp_digit r; + sp_digit t; + + /* All 59 bits from d1 and top 4 bits from d0. */ + d = (d1 << 4) + (d0 >> 55); + r = d / dv; + d -= r * dv; + /* Up to 5 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 51) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 9 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 47) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 13 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 43) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 17 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 39) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 21 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 35) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 25 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 31) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 29 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 27) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 33 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 23) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 37 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 19) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 41 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 15) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 45 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 11) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 49 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 7) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 53 bits in r */ + /* Next 4 bits from d0. */ + r <<= 4; + d <<= 4; + d += (d0 >> 3) & ((1 << 4) - 1); + t = d / dv; + d -= t * dv; + r += t; + /* Up to 57 bits in r */ + /* Remaining 3 bits from d0. */ + r <<= 3; + d <<= 3; + d += d0 & ((1 << 3) - 1); + t = d / dv; + r += t; + + /* All 59 bits from d1 and top 4 bits from d0. */ + return r; +} +#endif /* WOLFSSL_SP_DIV_64 */ + +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * Full implementation. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
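sp_4096_div_word_70() above is the WOLFSSL_SP_DIV_64 fallback: when no 128-bit division is available it rebuilds the 59-bit quotient four bits at a time using nothing wider than 64-bit divides. Its simplest form, one bit per step, is ordinary restoring division of a two-word value by one word (div_2by1_bits is an illustrative name; 32-bit words and the precondition d1 < dv are assumptions of the sketch):

#include <stdint.h>

/* Restoring division of the two-word value (d1:d0) by dv, one quotient bit
 * per iteration.  Requires d1 < dv so the quotient fits in one word.
 * sp_4096_div_word_70() takes the same approach 4 bits at a time to cut
 * the iteration count. */
static uint32_t div_2by1_bits(uint32_t d1, uint32_t d0, uint32_t dv)
{
    uint64_t rem = d1;
    uint32_t q = 0;
    int i;

    for (i = 31; i >= 0; i--) {
        rem = (rem << 1) | ((d0 >> i) & 1);
        q <<= 1;
        if (rem >= dv) {
            rem -= dv;
            q |= 1;
        }
    }
    return q;
}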
+ */ +static int sp_4096_div_70(const sp_digit* a, const sp_digit* d, + const sp_digit* m, sp_digit* r) +{ + int i; +#ifndef WOLFSSL_SP_DIV_64 + sp_int128 d1; +#endif + sp_digit dv; + sp_digit r1; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* t1 = NULL; +#else + sp_digit t1[4 * 70 + 3]; +#endif + sp_digit* t2 = NULL; + sp_digit* sd = NULL; + int err = MP_OKAY; + + (void)m; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 70 + 3), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (t1 == NULL) + err = MEMORY_E; +#endif + + (void)m; + + if (err == MP_OKAY) { + t2 = t1 + 140 + 1; + sd = t2 + 70 + 1; + + sp_4096_mul_d_70(sd, d, (sp_digit)1 << 34); + sp_4096_mul_d_140(t1, a, (sp_digit)1 << 34); + dv = sd[69]; + t1[70 + 70] += t1[70 + 70 - 1] >> 59; + t1[70 + 70 - 1] &= 0x7ffffffffffffffL; + for (i=70; i>=0; i--) { +#ifndef WOLFSSL_SP_DIV_64 + d1 = t1[70 + i]; + d1 <<= 59; + d1 += t1[70 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_70(t1[70 + i], t1[70 + i - 1], dv); +#endif + + sp_4096_mul_d_70(t2, sd, r1); + (void)sp_4096_sub_70(&t1[i], &t1[i], t2); + sp_4096_norm_70(&t1[i]); + t1[70 + i] -= t2[70]; + t1[70 + i] += t1[70 + i - 1] >> 59; + t1[70 + i - 1] &= 0x7ffffffffffffffL; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[70 + i]; + d1 <<= 59; + d1 -= t1[70 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_70(-t1[70 + i], -t1[70 + i - 1], dv); +#endif + r1 -= t1[70 + i]; + sp_4096_mul_d_70(t2, sd, r1); + (void)sp_4096_add_70(&t1[i], &t1[i], t2); + t1[70 + i] += t1[70 + i - 1] >> 59; + t1[70 + i - 1] &= 0x7ffffffffffffffL; + } + t1[70 - 1] += t1[70 - 2] >> 59; + t1[70 - 2] &= 0x7ffffffffffffffL; + r1 = t1[70 - 1] / dv; + + sp_4096_mul_d_70(t2, sd, r1); + sp_4096_sub_70(t1, t1, t2); + XMEMCPY(r, t1, sizeof(*r) * 140U); + for (i=0; i<69; i++) { + r[i+1] += r[i] >> 59; + r[i] &= 0x7ffffffffffffffL; + } + sp_4096_cond_add_70(r, r, sd, 0 - ((r[69] < 0) ? + (sp_digit)1 : (sp_digit)0)); + + sp_4096_norm_70(r); + sp_4096_rshift_70(r, r, 34); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (t1 != NULL) + XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. + */ +static int sp_4096_mod_70(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_70(a, m, NULL, r); +} + +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
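The cache-resistant branches of sp_4096_mod_exp_35() above (and of sp_4096_mod_exp_70() that follows) never index memory with a secret exponent bit directly; they pick between the two working values by masking their addresses with addr_mask[] and recombining. A small sketch of that branch-free pointer selection (ct_select_ptr and kAddrMask are illustrative; the generated code adds the masked addresses rather than ORing them, which is equivalent because exactly one mask is zero):

#include <stdint.h>

/* Branch-free selection between two pointers: mask[0] = 0, mask[1] = ~0,
 * so exactly one operand survives the AND and the OR recombines them.
 * bit must be 0 or 1. */
static const uintptr_t kAddrMask[2] = { 0, (uintptr_t)-1 };

static void* ct_select_ptr(void* a, void* b, int bit)
{
    return (void*)(((uintptr_t)a & kAddrMask[bit ^ 1]) |
                   ((uintptr_t)b & kAddrMask[bit]));
}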
+ */ +static int sp_4096_mod_exp_70(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ +#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 140]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 70 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 70 * 2); + XMEMSET(t[i], 0, sizeof(sp_digit) * 70U * 2U); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_70(t[1], a, m); + } + else { + XMEMCPY(t[1], a, sizeof(sp_digit) * 70U); + } + } + if (err == MP_OKAY) { + sp_4096_mul_70(t[1], t[1], norm); + err = sp_4096_mod_70(t[1], t[1], m); + } + + if (err == MP_OKAY) { + i = bits / 59; + c = bits % 59; + n = e[i--] << (59 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 59; + } + + y = (int)((n >> 58) & 1); + n <<= 1; + + sp_4096_mont_mul_70(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 70 * 2); + sp_4096_mont_sqr_70(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 70 * 2); + } + + sp_4096_mont_reduce_70(t[0], m, mp); + n = sp_4096_cmp_70(t[0], m); + sp_4096_cond_sub_70(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 70 * 2); + + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#elif !defined(WC_NO_CACHE_RESISTANT) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[3 * 140]; +#endif + sp_digit* t[3] = {0, 0, 0}; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 70 * 2, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<3; i++) { + t[i] = td + (i * 70 * 2); + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_70(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_70(t[1], t[1], norm); + err = sp_4096_mod_70(t[1], t[1], m); + } + } + else { + sp_4096_mul_70(t[1], a, norm); + err = sp_4096_mod_70(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + i = bits / 59; + c = bits % 59; + n = e[i--] << (59 - c); + for (; ; c--) { + if (c == 0) { + if (i == -1) { + break; + } + + n = e[i--]; + c = 59; + } + + y = (int)((n >> 58) & 1); + n <<= 1; + + sp_4096_mont_mul_70(t[y^1], t[0], t[1], m, mp); + + XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), + sizeof(*t[2]) * 70 * 2); + sp_4096_mont_sqr_70(t[2], t[2], m, mp); + XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) + + ((size_t)t[1] & addr_mask[y])), t[2], + sizeof(*t[2]) * 70 * 2); + } + + sp_4096_mont_reduce_70(t[0], m, mp); + n = sp_4096_cmp_70(t[0], m); + sp_4096_cond_sub_70(t[0], t[0], m, ((n < 0) ? 
+ (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, t[0], sizeof(*r) * 70 * 2); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[(16 * 140) + 140]; +#endif + sp_digit* t[16]; + sp_digit* rt = NULL; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 140) + 140), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<16; i++) + t[i] = td + i * 140; + rt = td + 2240; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + + if (reduceA != 0) { + err = sp_4096_mod_70(t[1], a, m); + if (err == MP_OKAY) { + sp_4096_mul_70(t[1], t[1], norm); + err = sp_4096_mod_70(t[1], t[1], m); + } + } + else { + sp_4096_mul_70(t[1], a, norm); + err = sp_4096_mod_70(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_70(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_70(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_70(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_70(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_70(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_70(t[ 7], t[ 4], t[ 3], m, mp); + sp_4096_mont_sqr_70(t[ 8], t[ 4], m, mp); + sp_4096_mont_mul_70(t[ 9], t[ 5], t[ 4], m, mp); + sp_4096_mont_sqr_70(t[10], t[ 5], m, mp); + sp_4096_mont_mul_70(t[11], t[ 6], t[ 5], m, mp); + sp_4096_mont_sqr_70(t[12], t[ 6], m, mp); + sp_4096_mont_mul_70(t[13], t[ 7], t[ 6], m, mp); + sp_4096_mont_sqr_70(t[14], t[ 7], m, mp); + sp_4096_mont_mul_70(t[15], t[ 8], t[ 7], m, mp); + + bits = ((bits + 3) / 4) * 4; + i = ((bits + 58) / 59) - 1; + c = bits % 59; + if (c == 0) { + c = 59; + } + if (i < 70) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 4) { + n |= e[i--] << (5 - c); + c += 59; + } + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + XMEMCPY(rt, t[y], sizeof(sp_digit) * 140); + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 5; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 55; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 5; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 59 - c; + } + + sp_4096_mont_sqr_70(rt, rt, m, mp); + sp_4096_mont_sqr_70(rt, rt, m, mp); + sp_4096_mont_sqr_70(rt, rt, m, mp); + sp_4096_mont_sqr_70(rt, rt, m, mp); + + sp_4096_mont_mul_70(rt, rt, t[y], m, mp); + } + + sp_4096_mont_reduce_70(rt, m, mp); + n = sp_4096_cmp_70(rt, m); + sp_4096_cond_sub_70(rt, rt, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + XMEMCPY(r, rt, sizeof(sp_digit) * 140); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_RSA +/* RSA public key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * em Public exponent. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. + * outLen Number of bytes in result. 
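sp_RsaPublic_4096() below requires the public exponent to fit in one 59-bit digit and then walks its bits from the top: square for every bit, multiply by the (Montgomery-form) base when the bit is set; the non-small build also short-circuits e == 3 into a single squaring and multiplication. The bare square-and-multiply scan on machine words looks like this (powmod_binary is an illustrative stand-in; mod < 2^32 keeps the 64-bit products exact):

#include <stdint.h>

/* Left-to-right binary exponentiation: one squaring per exponent bit and
 * one extra multiplication per set bit.  This is the shape of the exponent
 * loop in sp_RsaPublic_4096(), minus the Montgomery arithmetic.
 * Requires mod < 2^32. */
static uint64_t powmod_binary(uint64_t base, uint64_t exp, uint64_t mod)
{
    uint64_t r = 1 % mod;
    uint64_t b = base % mod;
    int i;

    for (i = 63; i >= 0; i--) {
        r = (r * r) % mod;
        if ((exp >> i) & 1) {
            r = (r * b) % mod;
        }
    }
    return r;
}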
+ * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. + */ +int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, + const mp_int* mm, byte* out, word32* outLen) +{ +#ifdef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[70 * 5]; +#endif + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit* norm = NULL; + sp_digit e[1] = {0}; + sp_digit mp; + int i; + int err = MP_OKAY; + + if (*outLen < 512U) { + err = MP_TO_E; + } + + if (err == MP_OKAY) { + if (mp_count_bits(em) > 59) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + r = a + 70 * 2; + m = r + 70 * 2; + norm = r; + + sp_4096_from_bin(a, 70, in, inLen); +#if DIGIT_BIT >= 59 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + + if (err == MP_OKAY) { + sp_4096_from_mp(m, 70, mm); + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + } + if (err == MP_OKAY) { + sp_4096_mul_70(a, a, norm); + err = sp_4096_mod_70(a, a, m); + } + if (err == MP_OKAY) { + for (i=58; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 70 * 2); + for (i--; i>=0; i--) { + sp_4096_mont_sqr_70(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_4096_mont_mul_70(r, r, a, m, mp); + } + } + sp_4096_mont_reduce_70(r, m, mp); + mp = sp_4096_cmp_70(r, m); + sp_4096_cond_sub_70(r, r, m, ((mp < 0) ? 
+ (sp_digit)1 : (sp_digit)0)- 1); + + sp_4096_to_bin_70(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[70 * 5]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + sp_digit e[1] = {0}; + int err = MP_OKAY; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(em) > 59) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 5, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d; + r = a + 70 * 2; + m = r + 70 * 2; + + sp_4096_from_bin(a, 70, in, inLen); +#if DIGIT_BIT >= 59 + e[0] = (sp_digit)em->dp[0]; +#else + e[0] = (sp_digit)em->dp[0]; + if (em->used > 1) { + e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT; + } +#endif + if (e[0] == 0) { + err = MP_EXPTMOD_E; + } + } + if (err == MP_OKAY) { + sp_4096_from_mp(m, 70, mm); + + if (e[0] == 0x3) { + sp_4096_sqr_70(r, a); + err = sp_4096_mod_70(r, r, m); + if (err == MP_OKAY) { + sp_4096_mul_70(r, a, r); + err = sp_4096_mod_70(r, r, m); + } + } + else { + sp_digit* norm = r; + int i; + sp_digit mp; + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + + sp_4096_mul_70(a, a, norm); + err = sp_4096_mod_70(a, a, m); + + if (err == MP_OKAY) { + for (i=58; i>=0; i--) { + if ((e[0] >> i) != 0) { + break; + } + } + + XMEMCPY(r, a, sizeof(sp_digit) * 140U); + for (i--; i>=0; i--) { + sp_4096_mont_sqr_70(r, r, m, mp); + + if (((e[0] >> i) & 1) == 1) { + sp_4096_mont_mul_70(r, r, a, m, mp); + } + } + sp_4096_mont_reduce_70(r, m, mp); + mp = sp_4096_cmp_70(r, m); + sp_4096_cond_sub_70(r, r, m, ((mp < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + } + } + + if (err == MP_OKAY) { + sp_4096_to_bin_70(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + + return err; +#endif /* WOLFSSL_SP_SMALL */ +} + +#ifndef WOLFSSL_RSA_PUBLIC_ONLY +#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM) +#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */ +/* RSA private key operation. + * + * in Array of bytes representing the number to exponentiate, base. + * inLen Number of bytes in base. + * dm Private exponent. + * pm First prime. + * qm Second prime. + * dpm First prime's CRT exponent. + * dqm Second prime's CRT exponent. + * qim Inverse of second prime mod p. + * mm Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. + * outLen Number of bytes in result. + * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when + * an array is too long and MEMORY_E when dynamic memory allocation fails. 
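With CRT parameters available, the private-key path below performs two half-size exponentiations, m1 = c^dP mod p and m2 = c^dQ mod q, then recombines them with Garner's formula m = m2 + q * (qInv * (m1 - m2) mod p); the two cond_add calls after the subtraction keep m1 - m2 non-negative. An end-to-end toy sketch of the same steps (powmod, rsa_crt_decrypt and the tiny parameters p = 11, q = 13, dP = 3, dQ = 7, qInv = 6 are all illustrative, chosen so 64-bit arithmetic suffices):

#include <stdint.h>

/* Plain square-and-multiply; the toy moduli keep products within 64 bits. */
static uint64_t powmod(uint64_t b, uint64_t e, uint64_t m)
{
    uint64_t r = 1 % m;

    b %= m;
    while (e != 0) {
        if (e & 1) {
            r = (r * b) % m;
        }
        b = (b * b) % m;
        e >>= 1;
    }
    return r;
}

/* RSA-CRT recombination, same steps as sp_RsaPrivate_4096():
 *   m1 = c^dP mod p,  m2 = c^dQ mod q,
 *   h  = qInv * (m1 - m2) mod p,  m = m2 + h * q.
 * With p = 11, q = 13, e = 7, d = 103 (dP = 3, dQ = 7, qInv = 6),
 * decrypting c = 48 (the encryption of 9) returns 9. */
static uint64_t rsa_crt_decrypt(uint64_t c, uint64_t p, uint64_t q,
                                uint64_t dP, uint64_t dQ, uint64_t qInv)
{
    uint64_t m1 = powmod(c, dP, p);
    uint64_t m2 = powmod(c, dQ, q);
    uint64_t h = (qInv * ((m1 + p - (m2 % p)) % p)) % p;

    return m2 + h * q;
}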
+ */ +int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, + const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm, + const mp_int* qim, const mp_int* mm, byte* out, word32* outLen) +{ +#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[70 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 4096) { + err = MP_READ_E; + } + else if (inLen > 512) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 70; + m = a + 140; + r = a; + + sp_4096_from_bin(a, 70, in, inLen); + sp_4096_from_mp(d, 70, dm); + sp_4096_from_mp(m, 70, mm); + err = sp_4096_mod_exp_70(r, a, d, 4096, m, 0); + } + + if (err == MP_OKAY) { + sp_4096_to_bin_70(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 70); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* d = NULL; +#else + sp_digit d[70 * 4]; +#endif + sp_digit* a = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)pm; + (void)qm; + (void)dpm; + (void)dqm; + (void)qim; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (mp_count_bits(dm) > 4096) { + err = MP_READ_E; + } + else if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 4, NULL, + DYNAMIC_TYPE_RSA); + if (d == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + a = d + 70; + m = a + 140; + r = a; + + sp_4096_from_bin(a, 70, in, inLen); + sp_4096_from_mp(d, 70, dm); + sp_4096_from_mp(m, 70, mm); + err = sp_4096_mod_exp_70(r, a, d, 4096, m, 0); + } + + if (err == MP_OKAY) { + sp_4096_to_bin_70(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (d != NULL) +#endif + { + /* only "a" and "r" are sensitive and need zeroized (same pointer) */ + if (a != NULL) + ForceZero(a, sizeof(sp_digit) * 70); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(d, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#else +#if defined(WOLFSSL_SP_SMALL) +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[35 * 8]; +#endif + sp_digit* p = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* 
r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 512) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 35 * 8, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + if (err == MP_OKAY) { + p = a + 70; + qi = dq = dp = p + 35; + tmpa = qi + 35; + tmpb = tmpa + 70; + r = a; + + sp_4096_from_bin(a, 70, in, inLen); + sp_4096_from_mp(p, 35, pm); + sp_4096_from_mp(dp, 35, dpm); + err = sp_4096_mod_exp_35(tmpa, a, dp, 2048, p, 1); + } + if (err == MP_OKAY) { + sp_4096_from_mp(p, 35, qm); + sp_4096_from_mp(dq, 35, dqm); + err = sp_4096_mod_exp_35(tmpb, a, dq, 2048, p, 1); + } + if (err == MP_OKAY) { + sp_4096_from_mp(p, 35, pm); + (void)sp_4096_sub_35(tmpa, tmpa, tmpb); + sp_4096_norm_35(tmpa); + sp_4096_cond_add_35(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[34] >> 63)); + sp_4096_cond_add_35(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[34] >> 63)); + + sp_4096_from_mp(qi, 35, qim); + sp_4096_mul_35(tmpa, tmpa, qi); + err = sp_4096_mod_35(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_4096_from_mp(p, 35, qm); + sp_4096_mul_35(tmpa, p, tmpa); + (void)sp_4096_add_70(r, tmpb, tmpa); + sp_4096_norm_70(r); + + sp_4096_to_bin_70(r, out); + *outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 35 * 8); +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); +#endif + } + + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* a = NULL; +#else + sp_digit a[35 * 13]; +#endif + sp_digit* p = NULL; + sp_digit* q = NULL; + sp_digit* dp = NULL; + sp_digit* dq = NULL; + sp_digit* qi = NULL; + sp_digit* tmpa = NULL; + sp_digit* tmpb = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + + (void)dm; + (void)mm; + + if (*outLen < 512U) { + err = MP_TO_E; + } + if (err == MP_OKAY) { + if (inLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mm) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mm)) { + err = MP_VAL; + } + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 35 * 13, NULL, + DYNAMIC_TYPE_RSA); + if (a == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + p = a + 70 * 2; + q = p + 35; + dp = q + 35; + dq = dp + 35; + qi = dq + 35; + tmpa = qi + 35; + tmpb = tmpa + 70; + r = a; + + sp_4096_from_bin(a, 70, in, inLen); + sp_4096_from_mp(p, 35, pm); + sp_4096_from_mp(q, 35, qm); + sp_4096_from_mp(dp, 35, dpm); + sp_4096_from_mp(dq, 35, dqm); + sp_4096_from_mp(qi, 35, qim); + + err = sp_4096_mod_exp_35(tmpa, a, dp, 2048, p, 1); + } + if (err == MP_OKAY) { + err = sp_4096_mod_exp_35(tmpb, a, dq, 2048, q, 1); + } + + if (err == MP_OKAY) { + (void)sp_4096_sub_35(tmpa, tmpa, tmpb); + sp_4096_norm_35(tmpa); + sp_4096_cond_add_35(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[34] >> 63)); + sp_4096_cond_add_35(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[34] >> 63)); + sp_4096_mul_35(tmpa, tmpa, qi); + err = sp_4096_mod_35(tmpa, tmpa, p); + } + + if (err == MP_OKAY) { + sp_4096_mul_35(tmpa, tmpa, q); + (void)sp_4096_add_70(r, tmpb, tmpa); + sp_4096_norm_70(r); + + sp_4096_to_bin_70(r, out); + 
*outLen = 512; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) +if (a != NULL) +#endif + { + ForceZero(a, sizeof(sp_digit) * 35 * 13); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(a, NULL, DYNAMIC_TYPE_RSA); + #endif + } + + return err; +#endif /* WOLFSSL_SP_SMALL */ +#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */ +} + +#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */ +#endif /* WOLFSSL_HAVE_SP_RSA */ +#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Convert an array of sp_digit to an mp_int. + * + * a A single precision integer. + * r A multi-precision integer. + */ +static int sp_4096_to_mp(const sp_digit* a, mp_int* r) +{ + int err; + + err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT); + if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/ +#if DIGIT_BIT == 59 + XMEMCPY(r->dp, a, sizeof(sp_digit) * 70); + r->used = 70; + mp_clamp(r); +#elif DIGIT_BIT < 59 + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 70; i++) { + r->dp[j] |= (mp_digit)(a[i] << s); + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + s = DIGIT_BIT - s; + r->dp[++j] = (mp_digit)(a[i] >> s); + while (s + DIGIT_BIT <= 59) { + s += DIGIT_BIT; + r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1; + if (s == SP_WORD_SIZE) { + r->dp[j] = 0; + } + else { + r->dp[j] = (mp_digit)(a[i] >> s); + } + } + s = 59 - s; + } + r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#else + int i; + int j = 0; + int s = 0; + + r->dp[0] = 0; + for (i = 0; i < 70; i++) { + r->dp[j] |= ((mp_digit)a[i]) << s; + if (s + 59 >= DIGIT_BIT) { + #if DIGIT_BIT != 32 && DIGIT_BIT != 64 + r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1; + #endif + s = DIGIT_BIT - s; + r->dp[++j] = a[i] >> s; + s = 59 - s; + } + else { + s += 59; + } + } + r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT; + mp_clamp(r); +#endif + } + + return err; +} + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. MP integer. + * exp Exponent. MP integer. + * mod Modulus. MP integer. + * res Result. MP integer. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
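sp_4096_to_mp() above repacks the 59-bit limbs into mp_int digits of whatever width DIGIT_BIT has; the three preprocessor branches only differ in whether the target digit is the same size, narrower or wider. The underlying radix conversion is easier to see with a bit accumulator; here is a sketch that packs 59-bit limbs into a big-endian byte array, the same job sp_4096_to_bin_70() does with unrolled shifts (limbs59_to_bin is an illustrative name, and unsigned __int128 stands in for the file's sp_uint128):

#include <stdint.h>

/* Pack n little-endian 59-bit limbs into a big-endian byte array of
 * outLen bytes (outLen * 8 >= n * 59).  Bytes are emitted least
 * significant first, i.e. from the end of the output buffer. */
static void limbs59_to_bin(const uint64_t* a, int n, uint8_t* out, int outLen)
{
    unsigned __int128 acc = 0;   /* bit accumulator, low bits valid */
    int bits = 0;                /* number of valid bits in acc     */
    int i = 0;                   /* next limb to consume            */
    int j;

    for (j = outLen - 1; j >= 0; j--) {
        if ((bits < 8) && (i < n)) {
            acc |= (unsigned __int128)a[i++] << bits;
            bits += 59;
        }
        out[j] = (uint8_t)acc;
        acc >>= 8;
        bits = (bits >= 8) ? (bits - 8) : 0;
    }
}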
+ */ +int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, + mp_int* res) +{ +#ifdef WOLFSSL_SP_SMALL + int err = MP_OKAY; +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[70 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expBits > 4096) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 70 * 2; + m = e + 70; + r = b; + + sp_4096_from_mp(b, 70, base); + sp_4096_from_mp(e, 70, exp); + sp_4096_from_mp(m, 70, mod); + + err = sp_4096_mod_exp_70(r, b, e, mp_count_bits(exp), m, 0); + } + + if (err == MP_OKAY) { + err = sp_4096_to_mp(r, res); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 70U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + return err; +#else +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[70 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + int err = MP_OKAY; + int expBits = mp_count_bits(exp); + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expBits > 4096) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 4, NULL, DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 70 * 2; + m = e + 70; + r = b; + + sp_4096_from_mp(b, 70, base); + sp_4096_from_mp(e, 70, exp); + sp_4096_from_mp(m, 70, mod); + + err = sp_4096_mod_exp_70(r, b, e, expBits, m, 0); + } + + if (err == MP_OKAY) { + err = sp_4096_to_mp(r, res); + } + + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 70U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +#endif +} + +#ifdef WOLFSSL_HAVE_SP_DH + +#ifdef HAVE_FFDHE_4096 +SP_NOINLINE static void sp_4096_lshift_70(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + + r[70] = a[69] >> (59 - n); + for (i=69; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (59 - n))) & 0x7ffffffffffffffL; + } + r[0] = (a[0] << n) & 0x7ffffffffffffffL; +} + +/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m) + * + * r A single precision number that is the result of the operation. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
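Because the FFDHE generator is the constant 2, the routine that follows never multiplies by a stored base: applying a 5-bit window of the exponent is a left shift of the working value (sp_4096_lshift_70) followed by folding the bits pushed above 4096 back in with one mul_d and an add. The observation in its simplest form on machine words (powmod2 is an illustrative helper; m < 2^32 keeps the shift and the squares within 64 bits):

#include <stdint.h>

/* Compute 2^e mod m by square-and-shift: the multiply-by-base step of
 * square-and-multiply degenerates into r <<= 1 when the base is 2.
 * This is the idea behind sp_4096_mod_exp_2_70().  Requires m < 2^32. */
static uint64_t powmod2(uint64_t e, uint64_t m)
{
    uint64_t r = 1 % m;
    int i;

    for (i = 63; i >= 0; i--) {
        r = (r * r) % m;
        if ((e >> i) & 1) {
            r = (r << 1) % m;   /* multiply by 2 is a one-bit shift */
        }
    }
    return r;
}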
+ */ +static int sp_4096_mod_exp_2_70(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[211]; +#endif + sp_digit* norm = NULL; + sp_digit* tmp = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit o; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 211, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + tmp = td + 140; + XMEMSET(td, 0, sizeof(sp_digit) * 211); + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_70(norm, m); + + bits = ((bits + 4) / 5) * 5; + i = ((bits + 58) / 59) - 1; + c = bits % 59; + if (c == 0) { + c = 59; + } + if (i < 70) { + n = e[i--] << (64 - c); + } + else { + n = 0; + i--; + } + if (c < 5) { + n |= e[i--] << (5 - c); + c += 59; + } + y = (int)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + sp_4096_lshift_70(r, norm, (byte)y); + while ((i >= 0) || (c >= 5)) { + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 5; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 54; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 5; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 59 - c; + } + + sp_4096_mont_sqr_70(r, r, m, mp); + sp_4096_mont_sqr_70(r, r, m, mp); + sp_4096_mont_sqr_70(r, r, m, mp); + sp_4096_mont_sqr_70(r, r, m, mp); + sp_4096_mont_sqr_70(r, r, m, mp); + + sp_4096_lshift_70(r, r, (byte)y); + sp_4096_mul_d_70(tmp, norm, (r[70] << 34) + (r[69] >> 25)); + r[70] = 0; + r[69] &= 0x1ffffffL; + (void)sp_4096_add_70(r, r, tmp); + sp_4096_norm_70(r); + o = sp_4096_cmp_70(r, m); + sp_4096_cond_sub_70(r, r, m, ((o < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + + sp_4096_mont_reduce_70(r, m, mp); + n = sp_4096_cmp_70(r, m); + sp_4096_cond_sub_70(r, r, m, ((n < 0) ? + (sp_digit)1 : (sp_digit)0) - 1); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} + +#endif /* HAVE_FFDHE_4096 */ + +/* Perform the modular exponentiation for Diffie-Hellman. + * + * base Base. + * exp Array of bytes that is the exponent. + * expLen Length of data, in bytes, in exponent. + * mod Modulus. + * out Buffer to hold big-endian bytes of exponentiation result. + * Must be at least 512 bytes long. + * outLen Length, in bytes, of exponentiation result. + * returns 0 on success, MP_READ_E if there are too many bytes in an array + * and MEMORY_E if memory allocation fails. 
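sp_DhExp_4096() below also normalizes its output: after writing the full 512-byte big-endian result it strips leading zero octets and shortens outLen accordingly. That trimming step on its own (strip_leading_zeros is an illustrative name; like the loop below, an all-zero value yields length 0):

#include <stdint.h>
#include <string.h>

/* Strip leading zero octets from a big-endian buffer in place and return
 * the remaining length, matching the outLen adjustment in sp_DhExp_4096(). */
static uint32_t strip_leading_zeros(uint8_t* buf, uint32_t len)
{
    uint32_t i;

    for (i = 0; (i < len) && (buf[i] == 0); i++) {
        /* search for the first non-zero byte */
    }
    memmove(buf, buf + i, len - i);
    return len - i;
}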
+ */ +int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, + const mp_int* mod, byte* out, word32* outLen) +{ +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* b = NULL; +#else + sp_digit b[70 * 4]; +#endif + sp_digit* e = NULL; + sp_digit* m = NULL; + sp_digit* r = NULL; + word32 i; + int err = MP_OKAY; + + if (mp_count_bits(base) > 4096) { + err = MP_READ_E; + } + else if (expLen > 512U) { + err = MP_READ_E; + } + else if (mp_count_bits(mod) != 4096) { + err = MP_READ_E; + } + else if (mp_iseven(mod)) { + err = MP_VAL; + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (err == MP_OKAY) { + b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 70 * 4, NULL, + DYNAMIC_TYPE_DH); + if (b == NULL) + err = MEMORY_E; + } +#endif + + if (err == MP_OKAY) { + e = b + 70 * 2; + m = e + 70; + r = b; + + sp_4096_from_mp(b, 70, base); + sp_4096_from_bin(e, 70, exp, expLen); + sp_4096_from_mp(m, 70, mod); + + #ifdef HAVE_FFDHE_4096 + if (base->used == 1 && base->dp[0] == 2U && + ((m[69] << 7) | (m[68] >> 52)) == 0xffffffffL) { + err = sp_4096_mod_exp_2_70(r, e, expLen * 8U, m); + } + else { + #endif + err = sp_4096_mod_exp_70(r, b, e, expLen * 8U, m, 0); + #ifdef HAVE_FFDHE_4096 + } + #endif + } + + if (err == MP_OKAY) { + sp_4096_to_bin_70(r, out); + *outLen = 512; + for (i=0; i<512U && out[i] == 0U; i++) { + /* Search for first non-zero. */ + } + *outLen -= i; + XMEMMOVE(out, out + i, *outLen); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (b != NULL) +#endif + { + /* only "e" is sensitive and needs zeroized */ + if (e != NULL) + ForceZero(e, sizeof(sp_digit) * 70U); + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + XFREE(b, NULL, DYNAMIC_TYPE_DH); + #endif + } + + return err; +} +#endif /* WOLFSSL_HAVE_SP_DH */ + +#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ + +#else /* Read big endian unsigned byte array into r. * * r A single precision integer. @@ -8448,7 +17685,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_78(sp_digit* r, byte* a) { int i; int j; @@ -8486,6 +17723,57 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } +#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) +/* Normalize the values in each word to 53 bits. + * + * a Array of sp_digit to normalize. + */ +static void sp_4096_norm_39(sp_digit* a) +{ + int i; + for (i = 0; i < 32; i += 8) { + a[i+1] += a[i+0] >> 53; a[i+0] &= 0x1fffffffffffffL; + a[i+2] += a[i+1] >> 53; a[i+1] &= 0x1fffffffffffffL; + a[i+3] += a[i+2] >> 53; a[i+2] &= 0x1fffffffffffffL; + a[i+4] += a[i+3] >> 53; a[i+3] &= 0x1fffffffffffffL; + a[i+5] += a[i+4] >> 53; a[i+4] &= 0x1fffffffffffffL; + a[i+6] += a[i+5] >> 53; a[i+5] &= 0x1fffffffffffffL; + a[i+7] += a[i+6] >> 53; a[i+6] &= 0x1fffffffffffffL; + a[i+8] += a[i+7] >> 53; a[i+7] &= 0x1fffffffffffffL; + } + a[33] += a[32] >> 53; a[32] &= 0x1fffffffffffffL; + a[34] += a[33] >> 53; a[33] &= 0x1fffffffffffffL; + a[35] += a[34] >> 53; a[34] &= 0x1fffffffffffffL; + a[36] += a[35] >> 53; a[35] &= 0x1fffffffffffffL; + a[37] += a[36] >> 53; a[36] &= 0x1fffffffffffffL; + a[38] += a[37] >> 53; a[37] &= 0x1fffffffffffffL; +} + +#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +/* Normalize the values in each word to 53 bits. + * + * a Array of sp_digit to normalize. 
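The 78-word code path that begins here stores 4096-bit values in 53-bit limbs, and sp_4096_norm_78() simply propagates accumulated carries so every limb drops back below 2^53; the blocks of eight are a manual unrolling of one loop. That loop in its generic form (norm_limbs is an illustrative helper; unsigned limbs are used for brevity where the generated code works on signed sp_digits):

#include <stdint.h>

/* Carry propagation: after additions a limb may exceed 'bits' bits, so
 * push the excess into the next limb and mask.  The top limb keeps any
 * overflow, exactly as in sp_4096_norm_78() with bits = 53. */
static void norm_limbs(uint64_t* a, int n, int bits)
{
    const uint64_t mask = ((uint64_t)1 << bits) - 1;
    int i;

    for (i = 0; i < n - 1; i++) {
        a[i + 1] += a[i] >> bits;
        a[i] &= mask;
    }
}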
+ */ +static void sp_4096_norm_78(sp_digit* a) +{ + int i; + for (i = 0; i < 72; i += 8) { + a[i+1] += a[i+0] >> 53; a[i+0] &= 0x1fffffffffffffL; + a[i+2] += a[i+1] >> 53; a[i+1] &= 0x1fffffffffffffL; + a[i+3] += a[i+2] >> 53; a[i+2] &= 0x1fffffffffffffL; + a[i+4] += a[i+3] >> 53; a[i+3] &= 0x1fffffffffffffL; + a[i+5] += a[i+4] >> 53; a[i+4] &= 0x1fffffffffffffL; + a[i+6] += a[i+5] >> 53; a[i+5] &= 0x1fffffffffffffL; + a[i+7] += a[i+6] >> 53; a[i+6] &= 0x1fffffffffffffL; + a[i+8] += a[i+7] >> 53; a[i+7] &= 0x1fffffffffffffL; + } + a[73] += a[72] >> 53; a[72] &= 0x1fffffffffffffL; + a[74] += a[73] >> 53; a[73] &= 0x1fffffffffffffL; + a[75] += a[74] >> 53; a[74] &= 0x1fffffffffffffL; + a[76] += a[75] >> 53; a[75] &= 0x1fffffffffffffL; + a[77] += a[76] >> 53; a[76] &= 0x1fffffffffffffL; +} + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -8496,175 +17784,175 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) SP_NOINLINE static void sp_4096_mul_13(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 0]) * b[ 5] - + ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1] - + ((int128_t)a[ 5]) * b[ 0]; - int128_t t6 = ((int128_t)a[ 0]) * b[ 6] - + ((int128_t)a[ 1]) * b[ 5] - + ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2] - + ((int128_t)a[ 5]) * b[ 1] - + ((int128_t)a[ 6]) * b[ 0]; - int128_t t7 = ((int128_t)a[ 0]) * b[ 7] - + ((int128_t)a[ 1]) * b[ 6] - + ((int128_t)a[ 2]) * b[ 5] - + ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3] - + ((int128_t)a[ 5]) * b[ 2] - + ((int128_t)a[ 6]) * b[ 1] - + ((int128_t)a[ 7]) * b[ 0]; - int128_t t8 = ((int128_t)a[ 0]) * b[ 8] - + ((int128_t)a[ 1]) * b[ 7] - + ((int128_t)a[ 2]) * b[ 6] - + ((int128_t)a[ 3]) * b[ 5] - + ((int128_t)a[ 4]) * b[ 4] - + ((int128_t)a[ 5]) * b[ 3] - + ((int128_t)a[ 6]) * b[ 2] - + ((int128_t)a[ 7]) * b[ 1] - + ((int128_t)a[ 8]) * b[ 0]; - int128_t t9 = ((int128_t)a[ 0]) * b[ 9] - + ((int128_t)a[ 1]) * b[ 8] - + ((int128_t)a[ 2]) * b[ 7] - + ((int128_t)a[ 3]) * b[ 6] - + ((int128_t)a[ 4]) * b[ 5] - + ((int128_t)a[ 5]) * b[ 4] - + ((int128_t)a[ 6]) * b[ 3] - + ((int128_t)a[ 7]) * b[ 2] - + ((int128_t)a[ 8]) * b[ 1] - + ((int128_t)a[ 9]) * b[ 0]; - int128_t t10 = ((int128_t)a[ 0]) * b[10] - + ((int128_t)a[ 1]) * b[ 9] - + ((int128_t)a[ 2]) * b[ 8] - + ((int128_t)a[ 3]) * b[ 7] - + ((int128_t)a[ 4]) * b[ 6] - + ((int128_t)a[ 5]) * b[ 5] - + ((int128_t)a[ 6]) * b[ 4] - + ((int128_t)a[ 7]) * b[ 3] - + ((int128_t)a[ 8]) * b[ 2] - + ((int128_t)a[ 9]) * b[ 1] - + ((int128_t)a[10]) * b[ 0]; - int128_t t11 = ((int128_t)a[ 0]) * b[11] - + ((int128_t)a[ 1]) * b[10] - + ((int128_t)a[ 2]) * b[ 9] - + ((int128_t)a[ 3]) * b[ 8] - + ((int128_t)a[ 4]) * b[ 7] - + ((int128_t)a[ 5]) * b[ 6] - + ((int128_t)a[ 6]) * b[ 5] - + ((int128_t)a[ 7]) * b[ 4] - + ((int128_t)a[ 8]) * b[ 3] - + ((int128_t)a[ 9]) * b[ 2] - + ((int128_t)a[10]) * b[ 1] - + ((int128_t)a[11]) * b[ 0]; - int128_t t12 = 
((int128_t)a[ 0]) * b[12] - + ((int128_t)a[ 1]) * b[11] - + ((int128_t)a[ 2]) * b[10] - + ((int128_t)a[ 3]) * b[ 9] - + ((int128_t)a[ 4]) * b[ 8] - + ((int128_t)a[ 5]) * b[ 7] - + ((int128_t)a[ 6]) * b[ 6] - + ((int128_t)a[ 7]) * b[ 5] - + ((int128_t)a[ 8]) * b[ 4] - + ((int128_t)a[ 9]) * b[ 3] - + ((int128_t)a[10]) * b[ 2] - + ((int128_t)a[11]) * b[ 1] - + ((int128_t)a[12]) * b[ 0]; - int128_t t13 = ((int128_t)a[ 1]) * b[12] - + ((int128_t)a[ 2]) * b[11] - + ((int128_t)a[ 3]) * b[10] - + ((int128_t)a[ 4]) * b[ 9] - + ((int128_t)a[ 5]) * b[ 8] - + ((int128_t)a[ 6]) * b[ 7] - + ((int128_t)a[ 7]) * b[ 6] - + ((int128_t)a[ 8]) * b[ 5] - + ((int128_t)a[ 9]) * b[ 4] - + ((int128_t)a[10]) * b[ 3] - + ((int128_t)a[11]) * b[ 2] - + ((int128_t)a[12]) * b[ 1]; - int128_t t14 = ((int128_t)a[ 2]) * b[12] - + ((int128_t)a[ 3]) * b[11] - + ((int128_t)a[ 4]) * b[10] - + ((int128_t)a[ 5]) * b[ 9] - + ((int128_t)a[ 6]) * b[ 8] - + ((int128_t)a[ 7]) * b[ 7] - + ((int128_t)a[ 8]) * b[ 6] - + ((int128_t)a[ 9]) * b[ 5] - + ((int128_t)a[10]) * b[ 4] - + ((int128_t)a[11]) * b[ 3] - + ((int128_t)a[12]) * b[ 2]; - int128_t t15 = ((int128_t)a[ 3]) * b[12] - + ((int128_t)a[ 4]) * b[11] - + ((int128_t)a[ 5]) * b[10] - + ((int128_t)a[ 6]) * b[ 9] - + ((int128_t)a[ 7]) * b[ 8] - + ((int128_t)a[ 8]) * b[ 7] - + ((int128_t)a[ 9]) * b[ 6] - + ((int128_t)a[10]) * b[ 5] - + ((int128_t)a[11]) * b[ 4] - + ((int128_t)a[12]) * b[ 3]; - int128_t t16 = ((int128_t)a[ 4]) * b[12] - + ((int128_t)a[ 5]) * b[11] - + ((int128_t)a[ 6]) * b[10] - + ((int128_t)a[ 7]) * b[ 9] - + ((int128_t)a[ 8]) * b[ 8] - + ((int128_t)a[ 9]) * b[ 7] - + ((int128_t)a[10]) * b[ 6] - + ((int128_t)a[11]) * b[ 5] - + ((int128_t)a[12]) * b[ 4]; - int128_t t17 = ((int128_t)a[ 5]) * b[12] - + ((int128_t)a[ 6]) * b[11] - + ((int128_t)a[ 7]) * b[10] - + ((int128_t)a[ 8]) * b[ 9] - + ((int128_t)a[ 9]) * b[ 8] - + ((int128_t)a[10]) * b[ 7] - + ((int128_t)a[11]) * b[ 6] - + ((int128_t)a[12]) * b[ 5]; - int128_t t18 = ((int128_t)a[ 6]) * b[12] - + ((int128_t)a[ 7]) * b[11] - + ((int128_t)a[ 8]) * b[10] - + ((int128_t)a[ 9]) * b[ 9] - + ((int128_t)a[10]) * b[ 8] - + ((int128_t)a[11]) * b[ 7] - + ((int128_t)a[12]) * b[ 6]; - int128_t t19 = ((int128_t)a[ 7]) * b[12] - + ((int128_t)a[ 8]) * b[11] - + ((int128_t)a[ 9]) * b[10] - + ((int128_t)a[10]) * b[ 9] - + ((int128_t)a[11]) * b[ 8] - + ((int128_t)a[12]) * b[ 7]; - int128_t t20 = ((int128_t)a[ 8]) * b[12] - + ((int128_t)a[ 9]) * b[11] - + ((int128_t)a[10]) * b[10] - + ((int128_t)a[11]) * b[ 9] - + ((int128_t)a[12]) * b[ 8]; - int128_t t21 = ((int128_t)a[ 9]) * b[12] - + ((int128_t)a[10]) * b[11] - + ((int128_t)a[11]) * b[10] - + ((int128_t)a[12]) * b[ 9]; - int128_t t22 = ((int128_t)a[10]) * b[12] - + ((int128_t)a[11]) * b[11] - + ((int128_t)a[12]) * b[10]; - int128_t t23 = ((int128_t)a[11]) * b[12] - + ((int128_t)a[12]) * b[11]; - int128_t t24 = ((int128_t)a[12]) * b[12]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * b[ 0]; + sp_uint128 t1 = ((sp_uint128)a[ 0]) * b[ 1] + + ((sp_uint128)a[ 1]) * b[ 0]; + sp_uint128 t2 = ((sp_uint128)a[ 0]) * b[ 2] + + ((sp_uint128)a[ 1]) * b[ 1] + + ((sp_uint128)a[ 2]) * b[ 0]; + sp_uint128 t3 = ((sp_uint128)a[ 0]) * b[ 3] + + ((sp_uint128)a[ 1]) * b[ 2] + + ((sp_uint128)a[ 2]) * b[ 1] + + ((sp_uint128)a[ 3]) * b[ 0]; + sp_uint128 t4 = ((sp_uint128)a[ 0]) * b[ 4] + + ((sp_uint128)a[ 1]) * b[ 3] + + ((sp_uint128)a[ 2]) * b[ 2] + + ((sp_uint128)a[ 3]) * b[ 1] + + ((sp_uint128)a[ 4]) * b[ 0]; + sp_uint128 t5 = ((sp_uint128)a[ 0]) * b[ 5] + + ((sp_uint128)a[ 1]) * b[ 4] + + ((sp_uint128)a[ 2]) * 
b[ 3] + + ((sp_uint128)a[ 3]) * b[ 2] + + ((sp_uint128)a[ 4]) * b[ 1] + + ((sp_uint128)a[ 5]) * b[ 0]; + sp_uint128 t6 = ((sp_uint128)a[ 0]) * b[ 6] + + ((sp_uint128)a[ 1]) * b[ 5] + + ((sp_uint128)a[ 2]) * b[ 4] + + ((sp_uint128)a[ 3]) * b[ 3] + + ((sp_uint128)a[ 4]) * b[ 2] + + ((sp_uint128)a[ 5]) * b[ 1] + + ((sp_uint128)a[ 6]) * b[ 0]; + sp_uint128 t7 = ((sp_uint128)a[ 0]) * b[ 7] + + ((sp_uint128)a[ 1]) * b[ 6] + + ((sp_uint128)a[ 2]) * b[ 5] + + ((sp_uint128)a[ 3]) * b[ 4] + + ((sp_uint128)a[ 4]) * b[ 3] + + ((sp_uint128)a[ 5]) * b[ 2] + + ((sp_uint128)a[ 6]) * b[ 1] + + ((sp_uint128)a[ 7]) * b[ 0]; + sp_uint128 t8 = ((sp_uint128)a[ 0]) * b[ 8] + + ((sp_uint128)a[ 1]) * b[ 7] + + ((sp_uint128)a[ 2]) * b[ 6] + + ((sp_uint128)a[ 3]) * b[ 5] + + ((sp_uint128)a[ 4]) * b[ 4] + + ((sp_uint128)a[ 5]) * b[ 3] + + ((sp_uint128)a[ 6]) * b[ 2] + + ((sp_uint128)a[ 7]) * b[ 1] + + ((sp_uint128)a[ 8]) * b[ 0]; + sp_uint128 t9 = ((sp_uint128)a[ 0]) * b[ 9] + + ((sp_uint128)a[ 1]) * b[ 8] + + ((sp_uint128)a[ 2]) * b[ 7] + + ((sp_uint128)a[ 3]) * b[ 6] + + ((sp_uint128)a[ 4]) * b[ 5] + + ((sp_uint128)a[ 5]) * b[ 4] + + ((sp_uint128)a[ 6]) * b[ 3] + + ((sp_uint128)a[ 7]) * b[ 2] + + ((sp_uint128)a[ 8]) * b[ 1] + + ((sp_uint128)a[ 9]) * b[ 0]; + sp_uint128 t10 = ((sp_uint128)a[ 0]) * b[10] + + ((sp_uint128)a[ 1]) * b[ 9] + + ((sp_uint128)a[ 2]) * b[ 8] + + ((sp_uint128)a[ 3]) * b[ 7] + + ((sp_uint128)a[ 4]) * b[ 6] + + ((sp_uint128)a[ 5]) * b[ 5] + + ((sp_uint128)a[ 6]) * b[ 4] + + ((sp_uint128)a[ 7]) * b[ 3] + + ((sp_uint128)a[ 8]) * b[ 2] + + ((sp_uint128)a[ 9]) * b[ 1] + + ((sp_uint128)a[10]) * b[ 0]; + sp_uint128 t11 = ((sp_uint128)a[ 0]) * b[11] + + ((sp_uint128)a[ 1]) * b[10] + + ((sp_uint128)a[ 2]) * b[ 9] + + ((sp_uint128)a[ 3]) * b[ 8] + + ((sp_uint128)a[ 4]) * b[ 7] + + ((sp_uint128)a[ 5]) * b[ 6] + + ((sp_uint128)a[ 6]) * b[ 5] + + ((sp_uint128)a[ 7]) * b[ 4] + + ((sp_uint128)a[ 8]) * b[ 3] + + ((sp_uint128)a[ 9]) * b[ 2] + + ((sp_uint128)a[10]) * b[ 1] + + ((sp_uint128)a[11]) * b[ 0]; + sp_uint128 t12 = ((sp_uint128)a[ 0]) * b[12] + + ((sp_uint128)a[ 1]) * b[11] + + ((sp_uint128)a[ 2]) * b[10] + + ((sp_uint128)a[ 3]) * b[ 9] + + ((sp_uint128)a[ 4]) * b[ 8] + + ((sp_uint128)a[ 5]) * b[ 7] + + ((sp_uint128)a[ 6]) * b[ 6] + + ((sp_uint128)a[ 7]) * b[ 5] + + ((sp_uint128)a[ 8]) * b[ 4] + + ((sp_uint128)a[ 9]) * b[ 3] + + ((sp_uint128)a[10]) * b[ 2] + + ((sp_uint128)a[11]) * b[ 1] + + ((sp_uint128)a[12]) * b[ 0]; + sp_uint128 t13 = ((sp_uint128)a[ 1]) * b[12] + + ((sp_uint128)a[ 2]) * b[11] + + ((sp_uint128)a[ 3]) * b[10] + + ((sp_uint128)a[ 4]) * b[ 9] + + ((sp_uint128)a[ 5]) * b[ 8] + + ((sp_uint128)a[ 6]) * b[ 7] + + ((sp_uint128)a[ 7]) * b[ 6] + + ((sp_uint128)a[ 8]) * b[ 5] + + ((sp_uint128)a[ 9]) * b[ 4] + + ((sp_uint128)a[10]) * b[ 3] + + ((sp_uint128)a[11]) * b[ 2] + + ((sp_uint128)a[12]) * b[ 1]; + sp_uint128 t14 = ((sp_uint128)a[ 2]) * b[12] + + ((sp_uint128)a[ 3]) * b[11] + + ((sp_uint128)a[ 4]) * b[10] + + ((sp_uint128)a[ 5]) * b[ 9] + + ((sp_uint128)a[ 6]) * b[ 8] + + ((sp_uint128)a[ 7]) * b[ 7] + + ((sp_uint128)a[ 8]) * b[ 6] + + ((sp_uint128)a[ 9]) * b[ 5] + + ((sp_uint128)a[10]) * b[ 4] + + ((sp_uint128)a[11]) * b[ 3] + + ((sp_uint128)a[12]) * b[ 2]; + sp_uint128 t15 = ((sp_uint128)a[ 3]) * b[12] + + ((sp_uint128)a[ 4]) * b[11] + + ((sp_uint128)a[ 5]) * b[10] + + ((sp_uint128)a[ 6]) * b[ 9] + + ((sp_uint128)a[ 7]) * b[ 8] + + ((sp_uint128)a[ 8]) * b[ 7] + + ((sp_uint128)a[ 9]) * b[ 6] + + ((sp_uint128)a[10]) * b[ 5] + + ((sp_uint128)a[11]) * b[ 4] + + ((sp_uint128)a[12]) * b[ 3]; 
+ sp_uint128 t16 = ((sp_uint128)a[ 4]) * b[12] + + ((sp_uint128)a[ 5]) * b[11] + + ((sp_uint128)a[ 6]) * b[10] + + ((sp_uint128)a[ 7]) * b[ 9] + + ((sp_uint128)a[ 8]) * b[ 8] + + ((sp_uint128)a[ 9]) * b[ 7] + + ((sp_uint128)a[10]) * b[ 6] + + ((sp_uint128)a[11]) * b[ 5] + + ((sp_uint128)a[12]) * b[ 4]; + sp_uint128 t17 = ((sp_uint128)a[ 5]) * b[12] + + ((sp_uint128)a[ 6]) * b[11] + + ((sp_uint128)a[ 7]) * b[10] + + ((sp_uint128)a[ 8]) * b[ 9] + + ((sp_uint128)a[ 9]) * b[ 8] + + ((sp_uint128)a[10]) * b[ 7] + + ((sp_uint128)a[11]) * b[ 6] + + ((sp_uint128)a[12]) * b[ 5]; + sp_uint128 t18 = ((sp_uint128)a[ 6]) * b[12] + + ((sp_uint128)a[ 7]) * b[11] + + ((sp_uint128)a[ 8]) * b[10] + + ((sp_uint128)a[ 9]) * b[ 9] + + ((sp_uint128)a[10]) * b[ 8] + + ((sp_uint128)a[11]) * b[ 7] + + ((sp_uint128)a[12]) * b[ 6]; + sp_uint128 t19 = ((sp_uint128)a[ 7]) * b[12] + + ((sp_uint128)a[ 8]) * b[11] + + ((sp_uint128)a[ 9]) * b[10] + + ((sp_uint128)a[10]) * b[ 9] + + ((sp_uint128)a[11]) * b[ 8] + + ((sp_uint128)a[12]) * b[ 7]; + sp_uint128 t20 = ((sp_uint128)a[ 8]) * b[12] + + ((sp_uint128)a[ 9]) * b[11] + + ((sp_uint128)a[10]) * b[10] + + ((sp_uint128)a[11]) * b[ 9] + + ((sp_uint128)a[12]) * b[ 8]; + sp_uint128 t21 = ((sp_uint128)a[ 9]) * b[12] + + ((sp_uint128)a[10]) * b[11] + + ((sp_uint128)a[11]) * b[10] + + ((sp_uint128)a[12]) * b[ 9]; + sp_uint128 t22 = ((sp_uint128)a[10]) * b[12] + + ((sp_uint128)a[11]) * b[11] + + ((sp_uint128)a[12]) * b[10]; + sp_uint128 t23 = ((sp_uint128)a[11]) * b[12] + + ((sp_uint128)a[12]) * b[11]; + sp_uint128 t24 = ((sp_uint128)a[12]) * b[12]; t1 += t0 >> 53; r[ 0] = t0 & 0x1fffffffffffffL; t2 += t1 >> 53; r[ 1] = t1 & 0x1fffffffffffffL; @@ -8701,97 +17989,97 @@ SP_NOINLINE static void sp_4096_mul_13(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_4096_sqr_13(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 0]) * a[ 5] - + ((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 0]) * a[ 6] - + ((int128_t)a[ 1]) * a[ 5] - + ((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 0]) * a[ 7] - + ((int128_t)a[ 1]) * a[ 6] - + ((int128_t)a[ 2]) * a[ 5] - + ((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = (((int128_t)a[ 0]) * a[ 8] - + ((int128_t)a[ 1]) * a[ 7] - + ((int128_t)a[ 2]) * a[ 6] - + ((int128_t)a[ 3]) * a[ 5]) * 2 - + ((int128_t)a[ 4]) * a[ 4]; - int128_t t9 = (((int128_t)a[ 0]) * a[ 9] - + ((int128_t)a[ 1]) * a[ 8] - + ((int128_t)a[ 2]) * a[ 7] - + ((int128_t)a[ 3]) * a[ 6] - + ((int128_t)a[ 4]) * a[ 5]) * 2; - int128_t t10 = (((int128_t)a[ 0]) * a[10] - + ((int128_t)a[ 1]) * a[ 9] - + ((int128_t)a[ 2]) * a[ 8] - + ((int128_t)a[ 3]) * a[ 7] - + ((int128_t)a[ 4]) * a[ 6]) * 2 - + ((int128_t)a[ 5]) * a[ 5]; - int128_t t11 = (((int128_t)a[ 0]) * a[11] - + ((int128_t)a[ 1]) * a[10] - + ((int128_t)a[ 2]) * a[ 9] - + ((int128_t)a[ 3]) * a[ 8] - + ((int128_t)a[ 4]) * a[ 7] - + ((int128_t)a[ 5]) * a[ 6]) * 2; - int128_t t12 = (((int128_t)a[ 0]) * a[12] - + ((int128_t)a[ 1]) * a[11] - + ((int128_t)a[ 2]) * a[10] - + ((int128_t)a[ 3]) * a[ 9] - + ((int128_t)a[ 4]) * a[ 8] - + ((int128_t)a[ 5]) * a[ 7]) * 2 - + ((int128_t)a[ 
6]) * a[ 6]; - int128_t t13 = (((int128_t)a[ 1]) * a[12] - + ((int128_t)a[ 2]) * a[11] - + ((int128_t)a[ 3]) * a[10] - + ((int128_t)a[ 4]) * a[ 9] - + ((int128_t)a[ 5]) * a[ 8] - + ((int128_t)a[ 6]) * a[ 7]) * 2; - int128_t t14 = (((int128_t)a[ 2]) * a[12] - + ((int128_t)a[ 3]) * a[11] - + ((int128_t)a[ 4]) * a[10] - + ((int128_t)a[ 5]) * a[ 9] - + ((int128_t)a[ 6]) * a[ 8]) * 2 - + ((int128_t)a[ 7]) * a[ 7]; - int128_t t15 = (((int128_t)a[ 3]) * a[12] - + ((int128_t)a[ 4]) * a[11] - + ((int128_t)a[ 5]) * a[10] - + ((int128_t)a[ 6]) * a[ 9] - + ((int128_t)a[ 7]) * a[ 8]) * 2; - int128_t t16 = (((int128_t)a[ 4]) * a[12] - + ((int128_t)a[ 5]) * a[11] - + ((int128_t)a[ 6]) * a[10] - + ((int128_t)a[ 7]) * a[ 9]) * 2 - + ((int128_t)a[ 8]) * a[ 8]; - int128_t t17 = (((int128_t)a[ 5]) * a[12] - + ((int128_t)a[ 6]) * a[11] - + ((int128_t)a[ 7]) * a[10] - + ((int128_t)a[ 8]) * a[ 9]) * 2; - int128_t t18 = (((int128_t)a[ 6]) * a[12] - + ((int128_t)a[ 7]) * a[11] - + ((int128_t)a[ 8]) * a[10]) * 2 - + ((int128_t)a[ 9]) * a[ 9]; - int128_t t19 = (((int128_t)a[ 7]) * a[12] - + ((int128_t)a[ 8]) * a[11] - + ((int128_t)a[ 9]) * a[10]) * 2; - int128_t t20 = (((int128_t)a[ 8]) * a[12] - + ((int128_t)a[ 9]) * a[11]) * 2 - + ((int128_t)a[10]) * a[10]; - int128_t t21 = (((int128_t)a[ 9]) * a[12] - + ((int128_t)a[10]) * a[11]) * 2; - int128_t t22 = (((int128_t)a[10]) * a[12]) * 2 - + ((int128_t)a[11]) * a[11]; - int128_t t23 = (((int128_t)a[11]) * a[12]) * 2; - int128_t t24 = ((int128_t)a[12]) * a[12]; + sp_uint128 t0 = ((sp_uint128)a[ 0]) * a[ 0]; + sp_uint128 t1 = (((sp_uint128)a[ 0]) * a[ 1]) * 2; + sp_uint128 t2 = (((sp_uint128)a[ 0]) * a[ 2]) * 2 + + ((sp_uint128)a[ 1]) * a[ 1]; + sp_uint128 t3 = (((sp_uint128)a[ 0]) * a[ 3] + + ((sp_uint128)a[ 1]) * a[ 2]) * 2; + sp_uint128 t4 = (((sp_uint128)a[ 0]) * a[ 4] + + ((sp_uint128)a[ 1]) * a[ 3]) * 2 + + ((sp_uint128)a[ 2]) * a[ 2]; + sp_uint128 t5 = (((sp_uint128)a[ 0]) * a[ 5] + + ((sp_uint128)a[ 1]) * a[ 4] + + ((sp_uint128)a[ 2]) * a[ 3]) * 2; + sp_uint128 t6 = (((sp_uint128)a[ 0]) * a[ 6] + + ((sp_uint128)a[ 1]) * a[ 5] + + ((sp_uint128)a[ 2]) * a[ 4]) * 2 + + ((sp_uint128)a[ 3]) * a[ 3]; + sp_uint128 t7 = (((sp_uint128)a[ 0]) * a[ 7] + + ((sp_uint128)a[ 1]) * a[ 6] + + ((sp_uint128)a[ 2]) * a[ 5] + + ((sp_uint128)a[ 3]) * a[ 4]) * 2; + sp_uint128 t8 = (((sp_uint128)a[ 0]) * a[ 8] + + ((sp_uint128)a[ 1]) * a[ 7] + + ((sp_uint128)a[ 2]) * a[ 6] + + ((sp_uint128)a[ 3]) * a[ 5]) * 2 + + ((sp_uint128)a[ 4]) * a[ 4]; + sp_uint128 t9 = (((sp_uint128)a[ 0]) * a[ 9] + + ((sp_uint128)a[ 1]) * a[ 8] + + ((sp_uint128)a[ 2]) * a[ 7] + + ((sp_uint128)a[ 3]) * a[ 6] + + ((sp_uint128)a[ 4]) * a[ 5]) * 2; + sp_uint128 t10 = (((sp_uint128)a[ 0]) * a[10] + + ((sp_uint128)a[ 1]) * a[ 9] + + ((sp_uint128)a[ 2]) * a[ 8] + + ((sp_uint128)a[ 3]) * a[ 7] + + ((sp_uint128)a[ 4]) * a[ 6]) * 2 + + ((sp_uint128)a[ 5]) * a[ 5]; + sp_uint128 t11 = (((sp_uint128)a[ 0]) * a[11] + + ((sp_uint128)a[ 1]) * a[10] + + ((sp_uint128)a[ 2]) * a[ 9] + + ((sp_uint128)a[ 3]) * a[ 8] + + ((sp_uint128)a[ 4]) * a[ 7] + + ((sp_uint128)a[ 5]) * a[ 6]) * 2; + sp_uint128 t12 = (((sp_uint128)a[ 0]) * a[12] + + ((sp_uint128)a[ 1]) * a[11] + + ((sp_uint128)a[ 2]) * a[10] + + ((sp_uint128)a[ 3]) * a[ 9] + + ((sp_uint128)a[ 4]) * a[ 8] + + ((sp_uint128)a[ 5]) * a[ 7]) * 2 + + ((sp_uint128)a[ 6]) * a[ 6]; + sp_uint128 t13 = (((sp_uint128)a[ 1]) * a[12] + + ((sp_uint128)a[ 2]) * a[11] + + ((sp_uint128)a[ 3]) * a[10] + + ((sp_uint128)a[ 4]) * a[ 9] + + ((sp_uint128)a[ 5]) * a[ 8] + + ((sp_uint128)a[ 6]) * a[ 7]) 
* 2; + sp_uint128 t14 = (((sp_uint128)a[ 2]) * a[12] + + ((sp_uint128)a[ 3]) * a[11] + + ((sp_uint128)a[ 4]) * a[10] + + ((sp_uint128)a[ 5]) * a[ 9] + + ((sp_uint128)a[ 6]) * a[ 8]) * 2 + + ((sp_uint128)a[ 7]) * a[ 7]; + sp_uint128 t15 = (((sp_uint128)a[ 3]) * a[12] + + ((sp_uint128)a[ 4]) * a[11] + + ((sp_uint128)a[ 5]) * a[10] + + ((sp_uint128)a[ 6]) * a[ 9] + + ((sp_uint128)a[ 7]) * a[ 8]) * 2; + sp_uint128 t16 = (((sp_uint128)a[ 4]) * a[12] + + ((sp_uint128)a[ 5]) * a[11] + + ((sp_uint128)a[ 6]) * a[10] + + ((sp_uint128)a[ 7]) * a[ 9]) * 2 + + ((sp_uint128)a[ 8]) * a[ 8]; + sp_uint128 t17 = (((sp_uint128)a[ 5]) * a[12] + + ((sp_uint128)a[ 6]) * a[11] + + ((sp_uint128)a[ 7]) * a[10] + + ((sp_uint128)a[ 8]) * a[ 9]) * 2; + sp_uint128 t18 = (((sp_uint128)a[ 6]) * a[12] + + ((sp_uint128)a[ 7]) * a[11] + + ((sp_uint128)a[ 8]) * a[10]) * 2 + + ((sp_uint128)a[ 9]) * a[ 9]; + sp_uint128 t19 = (((sp_uint128)a[ 7]) * a[12] + + ((sp_uint128)a[ 8]) * a[11] + + ((sp_uint128)a[ 9]) * a[10]) * 2; + sp_uint128 t20 = (((sp_uint128)a[ 8]) * a[12] + + ((sp_uint128)a[ 9]) * a[11]) * 2 + + ((sp_uint128)a[10]) * a[10]; + sp_uint128 t21 = (((sp_uint128)a[ 9]) * a[12] + + ((sp_uint128)a[10]) * a[11]) * 2; + sp_uint128 t22 = (((sp_uint128)a[10]) * a[12]) * 2 + + ((sp_uint128)a[11]) * a[11]; + sp_uint128 t23 = (((sp_uint128)a[11]) * a[12]) * 2; + sp_uint128 t24 = ((sp_uint128)a[12]) * a[12]; t1 += t0 >> 53; r[ 0] = t0 & 0x1fffffffffffffL; t2 += t1 >> 53; r[ 1] = t1 & 0x1fffffffffffffL; @@ -9132,162 +18420,76 @@ SP_NOINLINE static void sp_4096_sqr_78(sp_digit* r, const sp_digit* a) } #endif /* !WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) +/* Caclulate the bottom digit of -1/a mod 2^n. + * + * a A single precision number. + * rho Bottom word of inverse. + */ +static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) +{ + sp_digit x; + sp_digit b; + + b = a[0]; + x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ + x *= 2 - b * x; /* here x*a==1 mod 2**8 */ + x *= 2 - b * x; /* here x*a==1 mod 2**16 */ + x *= 2 - b * x; /* here x*a==1 mod 2**32 */ + x *= 2 - b * x; /* here x*a==1 mod 2**64 */ + x &= 0x1fffffffffffffL; + + /* rho = -1/m mod b */ + *rho = ((sp_digit)1 << 53) - x; +} + +/* Multiply a by scalar b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. - * b A single precision integer. + * b A scalar. */ -SP_NOINLINE static int sp_4096_add_78(sp_digit* r, const sp_digit* a, - const sp_digit* b) +SP_NOINLINE static void sp_4096_mul_d_78(sp_digit* r, const sp_digit* a, + sp_digit b) { + sp_int128 tb = b; + sp_int128 t = 0; + sp_digit t2; + sp_int128 p[4]; int i; - for (i = 0; i < 78; i++) { - r[i] = a[i] + b[i]; + for (i = 0; i < 76; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 3] = (sp_digit)t2; } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
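sp_4096_mont_setup, now placed ahead of the multipliers it serves, derives rho = -1/m[0] mod 2^53 by Hensel lifting: a 4-bit seed inverse is refined with x *= 2 - m[0]*x, doubling the number of correct low bits each step, then the result is truncated to the digit width and negated. A standalone sketch of the same computation (function name illustrative; m0 must be odd):

    #include <stdint.h>

    /* rho = -(1/m0) mod 2^53 for odd m0, as needed by Montgomery
     * reduction with 53-bit digits. Each step doubles the number of
     * correct low bits of the inverse. */
    static uint64_t mont_rho_53(uint64_t m0)
    {
        uint64_t x = (((m0 + 2) & 4) << 1) + m0; /* inverse mod 2^4  */
        x *= 2 - m0 * x;                         /* inverse mod 2^8  */
        x *= 2 - m0 * x;                         /* inverse mod 2^16 */
        x *= 2 - m0 * x;                         /* inverse mod 2^32 */
        x *= 2 - m0 * x;                         /* inverse mod 2^64 */
        x &= 0x1fffffffffffffULL;                /* keep the low 53 bits */
        return ((uint64_t)1 << 53) - x;          /* negate mod 2^53 */
    }

With rho in hand, each reduction step can pick mu = (a[i] * rho) & 0x1fffffffffffffL so that adding mu*m clears digit i of the running value.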
- */ -SP_NOINLINE static int sp_4096_sub_78(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 78; i++) { - r[i] = a[i] - b[i]; - } - - return 0; + t += tb * a[76]; + r[76] = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + t += tb * a[77]; + r[77] = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[78] = (sp_digit)(t & 0x1fffffffffffffL); } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_4096_mul_78(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[77]) * b[77]; - r[155] = (sp_digit)(c >> 53); - c = (c & 0x1fffffffffffffL) << 53; - for (k = 153; k >= 0; k--) { - for (i = 77; i >= 0; i--) { - j = k - i; - if (j >= 78) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 106); - r[k + 1] = (sp_digit)((c >> 53) & 0x1fffffffffffffL); - c = (c & 0x1fffffffffffffL) << 53; - } - r[0] = (sp_digit)(c >> 53); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_78(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[77]) * a[77]; - r[155] = (sp_digit)(c >> 53); - c = (c & 0x1fffffffffffffL) << 53; - for (k = 153; k >= 0; k--) { - for (i = 77; i >= 0; i--) { - j = k - i; - if (j >= 78 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 106); - r[k + 1] = (sp_digit)((c >> 53) & 0x1fffffffffffffL); - c = (c & 0x1fffffffffffffL) << 53; - } - r[0] = (sp_digit)(c >> 53); -} - -#endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) -#ifdef WOLFSSL_SP_SMALL -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_add_39(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 39; i++) { - r[i] = a[i] + b[i]; - } - - return 0; -} -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static int sp_4096_sub_39(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - - for (i = 0; i < 39; i++) { - r[i] = a[i] - b[i]; - } - - return 0; -} - -#else /* Sub b from a into r. (r = a - b) * * r A single precision integer. @@ -9320,169 +18522,6 @@ SP_NOINLINE static int sp_4096_sub_39(sp_digit* r, const sp_digit* a, return 0; } -#endif /* WOLFSSL_SP_SMALL */ -#ifdef WOLFSSL_SP_SMALL -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -SP_NOINLINE static void sp_4096_mul_39(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[38]) * b[38]; - r[77] = (sp_digit)(c >> 53); - c = (c & 0x1fffffffffffffL) << 53; - for (k = 75; k >= 0; k--) { - for (i = 38; i >= 0; i--) { - j = k - i; - if (j >= 39) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; - } - r[k + 2] += (sp_digit)(c >> 106); - r[k + 1] = (sp_digit)((c >> 53) & 0x1fffffffffffffL); - c = (c & 0x1fffffffffffffL) << 53; - } - r[0] = (sp_digit)(c >> 53); -} - -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_4096_sqr_39(sp_digit* r, const sp_digit* a) -{ - int i; - int j; - int k; - int128_t c; - - c = ((int128_t)a[38]) * a[38]; - r[77] = (sp_digit)(c >> 53); - c = (c & 0x1fffffffffffffL) << 53; - for (k = 75; k >= 0; k--) { - for (i = 38; i >= 0; i--) { - j = k - i; - if (j >= 39 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; - } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; - } - - r[k + 2] += (sp_digit)(c >> 106); - r[k + 1] = (sp_digit)((c >> 53) & 0x1fffffffffffffL); - c = (c & 0x1fffffffffffffL) << 53; - } - r[0] = (sp_digit)(c >> 53); -} - -#endif /* WOLFSSL_SP_SMALL */ -#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ -#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */ - -/* Caclulate the bottom digit of -1/a mod 2^n. - * - * a A single precision number. - * rho Bottom word of inverse. - */ -static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) -{ - sp_digit x; - sp_digit b; - - b = a[0]; - x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ - x *= 2 - b * x; /* here x*a==1 mod 2**8 */ - x *= 2 - b * x; /* here x*a==1 mod 2**16 */ - x *= 2 - b * x; /* here x*a==1 mod 2**32 */ - x *= 2 - b * x; /* here x*a==1 mod 2**64 */ - x &= 0x1fffffffffffffL; - - /* rho = -1/m mod b */ - *rho = ((sp_digit)1 << 53) - x; -} - -/* Multiply a by scalar b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A scalar. 
- */ -SP_NOINLINE static void sp_4096_mul_d_78(sp_digit* r, const sp_digit* a, - sp_digit b) -{ -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 78; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - } - r[78] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; - sp_digit t2; - int128_t p[4]; - int i; - - for (i = 0; i < 76; i += 4) { - p[0] = tb * a[i + 0]; - p[1] = tb * a[i + 1]; - p[2] = tb * a[i + 2]; - p[3] = tb * a[i + 3]; - t += p[0]; - t2 = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - r[i + 0] = (sp_digit)t2; - t += p[1]; - t2 = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - r[i + 1] = (sp_digit)t2; - t += p[2]; - t2 = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - r[i + 2] = (sp_digit)t2; - t += p[3]; - t2 = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - r[i + 3] = (sp_digit)t2; - } - t += tb * a[76]; - r[76] = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - t += tb * a[77]; - r[77] = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - r[78] = (sp_digit)(t & 0x1fffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ -} - -#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY) -#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * @@ -9492,13 +18531,6 @@ SP_NOINLINE static void sp_4096_mul_d_78(sp_digit* r, const sp_digit* a, static void sp_4096_mont_norm_39(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<38; i++) { - r[i] = 0x1fffffffffffffL; - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -9517,7 +18549,6 @@ static void sp_4096_mont_norm_39(sp_digit* r, const sp_digit* m) r[35] = 0x1fffffffffffffL; r[36] = 0x1fffffffffffffL; r[37] = 0x1fffffffffffffL; -#endif r[38] = 0x3ffffffffL; /* r = (2^n - 1) mod n */ @@ -9537,13 +18568,6 @@ static void sp_4096_mont_norm_39(sp_digit* r, const sp_digit* m) static sp_digit sp_4096_cmp_39(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=38; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[38] - b[38]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -9563,7 +18587,6 @@ static sp_digit sp_4096_cmp_39(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -9579,13 +18602,6 @@ static sp_digit sp_4096_cmp_39(const sp_digit* a, const sp_digit* b) static void sp_4096_cond_sub_39(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 39; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -9605,7 +18621,6 @@ static void sp_4096_cond_sub_39(sp_digit* r, const sp_digit* a, r[36] = a[36] - (b[36] & m); r[37] = a[37] - (b[37] & m); r[38] = a[38] - (b[38] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. 
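The unrolled mont_norm/cmp/cond_sub routines keep the branch-free shape of the loops they replace; only the mask idiom matters for the constant-time final correction of a Montgomery reduction. That idiom, reduced to its loop form (signed 64-bit digits holding 53 bits, as in this file; helper names are mine):

    #include <stdint.h>

    /* Compare two n-digit numbers with normalized digits (< 2^53).
     * Every digit is visited, so the running time does not depend on
     * the values; the sign of the result gives the ordering. */
    static int64_t ct_cmp(const int64_t *a, const int64_t *b, int n)
    {
        int64_t r = 0;
        int i;
        for (i = n - 1; i >= 0; i--)
            r |= (a[i] - b[i]) & (0 - (int64_t)(r == 0));
        return r;
    }

    /* r = a - (b & m), where m is 0 (keep a) or all ones (subtract b).
     * Borrows are left in the signed digits and cleaned up by the
     * normalization pass that follows in the callers. */
    static void ct_cond_sub(int64_t *r, const int64_t *a,
                            const int64_t *b, int64_t m, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            r[i] = a[i] - (b[i] & m);
    }

Once the first (most significant) differing digit sets r, the mask 0 - (r == 0) becomes zero and later digits no longer contribute, so only the ordering, not the position of the difference, shows up in the result.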
(r += a * b) @@ -9617,20 +18632,8 @@ static void sp_4096_cond_sub_39(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_mul_add_39(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 39; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1fffffffffffffL; - t >>= 53; - } - r[39] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1fffffffffffffL); @@ -9665,40 +18668,6 @@ SP_NOINLINE static void sp_4096_mul_add_39(sp_digit* r, const sp_digit* a, t[6] = tb * a[38]; r[38] += (sp_digit)((t[5] >> 53) + (t[6] & 0x1fffffffffffffL)); r[39] += (sp_digit)(t[6] >> 53); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 53. - * - * a Array of sp_digit to normalize. - */ -static void sp_4096_norm_39(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 38; i++) { - a[i+1] += a[i] >> 53; - a[i] &= 0x1fffffffffffffL; - } -#else - int i; - for (i = 0; i < 32; i += 8) { - a[i+1] += a[i+0] >> 53; a[i+0] &= 0x1fffffffffffffL; - a[i+2] += a[i+1] >> 53; a[i+1] &= 0x1fffffffffffffL; - a[i+3] += a[i+2] >> 53; a[i+2] &= 0x1fffffffffffffL; - a[i+4] += a[i+3] >> 53; a[i+3] &= 0x1fffffffffffffL; - a[i+5] += a[i+4] >> 53; a[i+4] &= 0x1fffffffffffffL; - a[i+6] += a[i+5] >> 53; a[i+5] &= 0x1fffffffffffffL; - a[i+7] += a[i+6] >> 53; a[i+6] &= 0x1fffffffffffffL; - a[i+8] += a[i+7] >> 53; a[i+7] &= 0x1fffffffffffffL; - } - a[32+1] += a[32] >> 53; a[32] &= 0x1fffffffffffffL; - a[33+1] += a[33] >> 53; a[33] &= 0x1fffffffffffffL; - a[34+1] += a[34] >> 53; a[34] &= 0x1fffffffffffffL; - a[35+1] += a[35] >> 53; a[35] &= 0x1fffffffffffffL; - a[36+1] += a[36] >> 53; a[36] &= 0x1fffffffffffffL; - a[37+1] += a[37] >> 53; a[37] &= 0x1fffffffffffffL; -#endif } /* Shift the result in the high 2048 bits down to the bottom. 
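sp_4096_mul_add_39 (and _78 further down) is the digit-by-number multiply-accumulate that Montgomery reduction calls once per low digit; the retained version unrolls it over eight products at a time. Stripped of the unrolling, the operation is simply the following (sketch; hypothetical name, unsigned __int128 standing in for sp_int128):

    #include <stdint.h>

    /* r[0..n] += a[0..n-1] * b, digits 53 bits wide. A 128-bit
     * accumulator carries between digits. */
    static void mul_add_digit(uint64_t *r, const uint64_t *a,
                              uint64_t b, int n)
    {
        unsigned __int128 t = 0;
        int i;
        for (i = 0; i < n; i++) {
            t += (unsigned __int128)a[i] * b + r[i];
            r[i] = (uint64_t)t & 0x1fffffffffffffULL;
            t >>= 53;
        }
        r[n] += (uint64_t)t;
    }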
@@ -9708,47 +18677,34 @@ static void sp_4096_norm_39(sp_digit* a) */ static void sp_4096_mont_shift_39(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - int128_t n = a[38] >> 34; - n += ((int128_t)a[39]) << 19; - - for (i = 0; i < 38; i++) { - r[i] = n & 0x1fffffffffffffL; - n >>= 53; - n += ((int128_t)a[40 + i]) << 19; - } - r[38] = (sp_digit)n; -#else - int i; - int128_t n = a[38] >> 34; - n += ((int128_t)a[39]) << 19; + sp_int128 n = a[38] >> 34; + n += ((sp_int128)a[39]) << 19; for (i = 0; i < 32; i += 8) { r[i + 0] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 40]) << 19; + n >>= 53; n += ((sp_int128)a[i + 40]) << 19; r[i + 1] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 41]) << 19; + n >>= 53; n += ((sp_int128)a[i + 41]) << 19; r[i + 2] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 42]) << 19; + n >>= 53; n += ((sp_int128)a[i + 42]) << 19; r[i + 3] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 43]) << 19; + n >>= 53; n += ((sp_int128)a[i + 43]) << 19; r[i + 4] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 44]) << 19; + n >>= 53; n += ((sp_int128)a[i + 44]) << 19; r[i + 5] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 45]) << 19; + n >>= 53; n += ((sp_int128)a[i + 45]) << 19; r[i + 6] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 46]) << 19; + n >>= 53; n += ((sp_int128)a[i + 46]) << 19; r[i + 7] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 47]) << 19; + n >>= 53; n += ((sp_int128)a[i + 47]) << 19; } - r[32] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[72]) << 19; - r[33] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[73]) << 19; - r[34] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[74]) << 19; - r[35] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[75]) << 19; - r[36] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[76]) << 19; - r[37] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[77]) << 19; + r[32] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[72]) << 19; + r[33] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[73]) << 19; + r[34] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[74]) << 19; + r[35] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[75]) << 19; + r[36] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[76]) << 19; + r[37] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[77]) << 19; r[38] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[39], 0, sizeof(*r) * 39U); } @@ -9775,7 +18731,7 @@ static void sp_4096_mont_reduce_39(sp_digit* a, const sp_digit* m, sp_digit mp) a[i+1] += a[i] >> 53; a[i] &= 0x1fffffffffffffL; sp_4096_mont_shift_39(a, a); - sp_4096_cond_sub_39(a, a, m, 0 - (((a[38] >> 34) > 0) ? + sp_4096_cond_sub_39(a, a, m, 0 - (((a[38] - m[38]) > 0) ? 
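sp_4096_mont_reduce_39 strings these pieces together: for each low digit it picks mu = a[i] * rho mod 2^53, accumulates mu * m at that position, normalizes, shifts the high half down (sp_4096_mont_shift_39) and finishes with one masked subtraction, whose condition now compares the top digit of a against the top digit of m rather than testing the bits above the 2048-bit boundary. The one-digit case shows the algebra in full; the multi-digit routine repeats exactly this step per digit (names illustrative, unsigned __int128 assumed):

    #include <stdint.h>

    /* One-digit Montgomery reduction: for odd m < 2^53, t < m * 2^53 and
     * rho = -1/m mod 2^53, return t * 2^-53 mod m. Sketch only. */
    static uint64_t mont_reduce_1(unsigned __int128 t, uint64_t m, uint64_t rho)
    {
        uint64_t mu = ((uint64_t)t * rho) & 0x1fffffffffffffULL;
        t += (unsigned __int128)mu * m;   /* low 53 bits are now zero */
        t >>= 53;                         /* exact division by 2^53   */
        if (t >= m)                       /* result is below 2*m; the */
            t -= m;                       /* source does this with a  */
        return (uint64_t)t;               /* masked subtract instead  */
    }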
(sp_digit)1 : (sp_digit)0)); sp_4096_norm_39(a); } @@ -9819,22 +18775,10 @@ static void sp_4096_mont_sqr_39(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 39; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - } - r[39] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 36; i += 4) { @@ -9869,7 +18813,6 @@ SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a, r[38] = (sp_digit)(t & 0x1fffffffffffffL); t >>= 53; r[39] = (sp_digit)(t & 0x1fffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ } /* Conditionally add a and b using the mask m. @@ -9883,13 +18826,6 @@ SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a, static void sp_4096_cond_add_39(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 39; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 32; i += 8) { @@ -9909,7 +18845,6 @@ static void sp_4096_cond_add_39(sp_digit* r, const sp_digit* a, r[36] = a[36] + (b[36] & m); r[37] = a[37] + (b[37] & m); r[38] = a[38] + (b[38] & m); -#endif /* WOLFSSL_SP_SMALL */ } SP_NOINLINE static void sp_4096_rshift_39(sp_digit* r, const sp_digit* a, @@ -9917,11 +18852,6 @@ SP_NOINLINE static void sp_4096_rshift_39(sp_digit* r, const sp_digit* a, { int i; -#ifdef WOLFSSL_SP_SMALL - for (i=0; i<38; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (53 - n))) & 0x1fffffffffffffL; - } -#else for (i=0; i<32; i += 8) { r[i+0] = (a[i+0] >> n) | ((a[i+1] << (53 - n)) & 0x1fffffffffffffL); r[i+1] = (a[i+1] >> n) | ((a[i+2] << (53 - n)) & 0x1fffffffffffffL); @@ -9938,7 +18868,6 @@ SP_NOINLINE static void sp_4096_rshift_39(sp_digit* r, const sp_digit* a, r[35] = (a[35] >> n) | ((a[36] << (53 - n)) & 0x1fffffffffffffL); r[36] = (a[36] >> n) | ((a[37] << (53 - n)) & 0x1fffffffffffffL); r[37] = (a[37] >> n) | ((a[38] << (53 - n)) & 0x1fffffffffffffL); -#endif r[38] = a[38] >> n; } @@ -10015,7 +18944,7 @@ static int sp_4096_div_39(const sp_digit* a, const sp_digit* d, { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; @@ -10064,7 +18993,14 @@ static int sp_4096_div_39(const sp_digit* a, const sp_digit* d, t1[39 + i] -= t2[39]; t1[39 + i] += t1[39 + i - 1] >> 53; t1[39 + i - 1] &= 0x1fffffffffffffL; - r1 = (((-t1[39 + i]) << 53) - t1[39 + i - 1]) / dv; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[39 + i]; + d1 <<= 53; + d1 -= t1[39 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_39(-t1[39 + i], -t1[39 + i - 1], dv); +#endif r1 -= t1[39 + i]; sp_4096_mul_d_39(t2, sd, r1); (void)sp_4096_add_39(&t1[i], &t1[i], t2); @@ -10391,13 +19327,25 @@ static int sp_4096_mod_exp_39(sp_digit* r, const sp_digit* a, const sp_digit* e, c -= 5; XMEMCPY(rt, t[y], sizeof(sp_digit) * 78); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (11 - c); - c += 53; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 11; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 48; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 11; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 53 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; 
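The division change in sp_4096_div_39 is an overflow fix: the old expression shifted a digit left by 53 inside 64-bit sp_digit arithmetic before dividing, which cannot hold the two-digit numerator. The replacement builds the numerator in sp_int128, or falls back to the chunked sp_4096_div_word_39 when WOLFSSL_SP_DIV_64 is defined. The 128-bit form of the trial quotient looks like this (sketch; unsigned __int128 as the wide type, name mine):

    #include <stdint.h>

    /* Trial quotient q ~ (hi * 2^53 + lo) / dv for schoolbook division
     * with 53-bit digits. The numerator is formed in 128 bits so the
     * shift by 53 cannot overflow. */
    static uint64_t trial_quotient(uint64_t hi, uint64_t lo, uint64_t dv)
    {
        unsigned __int128 n = ((unsigned __int128)hi << 53) + lo;
        return (uint64_t)(n / dv);
    }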
sp_4096_mont_sqr_39(rt, rt, m, mp); sp_4096_mont_sqr_39(rt, rt, m, mp); @@ -10436,13 +19384,6 @@ static int sp_4096_mod_exp_39(sp_digit* r, const sp_digit* a, const sp_digit* e, static void sp_4096_mont_norm_78(sp_digit* r, const sp_digit* m) { /* Set r = 2^n - 1. */ -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=0; i<77; i++) { - r[i] = 0x1fffffffffffffL; - } -#else int i; for (i = 0; i < 72; i += 8) { @@ -10460,7 +19401,6 @@ static void sp_4096_mont_norm_78(sp_digit* r, const sp_digit* m) r[74] = 0x1fffffffffffffL; r[75] = 0x1fffffffffffffL; r[76] = 0x1fffffffffffffL; -#endif r[77] = 0x7fffL; /* r = (2^n - 1) mod n */ @@ -10480,13 +19420,6 @@ static void sp_4096_mont_norm_78(sp_digit* r, const sp_digit* m) static sp_digit sp_4096_cmp_78(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i=77; i>=0; i--) { - r |= (a[i] - b[i]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); - } -#else int i; r |= (a[77] - b[77]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); @@ -10505,7 +19438,6 @@ static sp_digit sp_4096_cmp_78(const sp_digit* a, const sp_digit* b) r |= (a[i + 1] - b[i + 1]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); r |= (a[i + 0] - b[i + 0]) & (0 - (sp_digit)((r == 0) ? 1 : 0)); } -#endif /* WOLFSSL_SP_SMALL */ return r; } @@ -10521,13 +19453,6 @@ static sp_digit sp_4096_cmp_78(const sp_digit* a, const sp_digit* b) static void sp_4096_cond_sub_78(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 78; i++) { - r[i] = a[i] - (b[i] & m); - } -#else int i; for (i = 0; i < 72; i += 8) { @@ -10546,7 +19471,6 @@ static void sp_4096_cond_sub_78(sp_digit* r, const sp_digit* a, r[75] = a[75] - (b[75] & m); r[76] = a[76] - (b[76] & m); r[77] = a[77] - (b[77] & m); -#endif /* WOLFSSL_SP_SMALL */ } /* Mul a by scalar b and add into r. (r += a * b) @@ -10558,20 +19482,8 @@ static void sp_4096_cond_sub_78(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_mul_add_78(sp_digit* r, const sp_digit* a, const sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 78; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1fffffffffffffL; - t >>= 53; - } - r[78] += (sp_digit)t; -#else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1fffffffffffffL); @@ -10604,39 +19516,6 @@ SP_NOINLINE static void sp_4096_mul_add_78(sp_digit* r, const sp_digit* a, t[5] = tb * a[77]; r[77] += (sp_digit)((t[4] >> 53) + (t[5] & 0x1fffffffffffffL)); r[78] += (sp_digit)(t[5] >> 53); -#endif /* WOLFSSL_SP_SMALL */ -} - -/* Normalize the values in each word to 53. - * - * a Array of sp_digit to normalize. 
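The exponent scan rewritten above replaces the single "top up when short" branch with three explicit cases, so a 5-bit window is never assembled from a mix of stale buffer bits and freshly loaded digit bits. The same step, pulled out into a helper with its state made explicit (the struct and names are mine; e[] holds 53-bit digits and n keeps the unread bits left-justified in 64 bits):

    #include <stdint.h>

    struct win_state { const uint64_t *e; int i; uint64_t n; int c; };

    /* Return the next 5-bit window, most significant first, mirroring
     * the three cases of the reworked sp_4096_mod_exp_39 loop. */
    static unsigned next_window5(struct win_state *s)
    {
        unsigned y;
        if (s->c >= 5) {                        /* enough buffered bits */
            y = (unsigned)((s->n >> 59) & 0x1f);
            s->n <<= 5;
            s->c -= 5;
        }
        else if (s->c == 0) {                   /* buffer empty: refill */
            s->n = s->e[s->i--] << 11;          /* 53 bits, left-justified */
            y = (unsigned)((s->n >> 59) & 0x1f);
            s->n <<= 5;
            s->c = 48;
        }
        else {                                  /* window spans two digits */
            y = (unsigned)((s->n >> 59) & 0x1f);
            s->n = s->e[s->i--] << 11;
            s->c = 5 - s->c;                    /* bits still missing */
            y |= (unsigned)((s->n >> (64 - s->c)) & ((1u << s->c) - 1u));
            s->n <<= s->c;
            s->c = 53 - s->c;
        }
        return y;
    }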
- */ -static void sp_4096_norm_78(sp_digit* a) -{ -#ifdef WOLFSSL_SP_SMALL - int i; - for (i = 0; i < 77; i++) { - a[i+1] += a[i] >> 53; - a[i] &= 0x1fffffffffffffL; - } -#else - int i; - for (i = 0; i < 72; i += 8) { - a[i+1] += a[i+0] >> 53; a[i+0] &= 0x1fffffffffffffL; - a[i+2] += a[i+1] >> 53; a[i+1] &= 0x1fffffffffffffL; - a[i+3] += a[i+2] >> 53; a[i+2] &= 0x1fffffffffffffL; - a[i+4] += a[i+3] >> 53; a[i+3] &= 0x1fffffffffffffL; - a[i+5] += a[i+4] >> 53; a[i+4] &= 0x1fffffffffffffL; - a[i+6] += a[i+5] >> 53; a[i+5] &= 0x1fffffffffffffL; - a[i+7] += a[i+6] >> 53; a[i+6] &= 0x1fffffffffffffL; - a[i+8] += a[i+7] >> 53; a[i+7] &= 0x1fffffffffffffL; - } - a[72+1] += a[72] >> 53; a[72] &= 0x1fffffffffffffL; - a[73+1] += a[73] >> 53; a[73] &= 0x1fffffffffffffL; - a[74+1] += a[74] >> 53; a[74] &= 0x1fffffffffffffL; - a[75+1] += a[75] >> 53; a[75] &= 0x1fffffffffffffL; - a[76+1] += a[76] >> 53; a[76] &= 0x1fffffffffffffL; -#endif } /* Shift the result in the high 4096 bits down to the bottom. @@ -10646,46 +19525,33 @@ static void sp_4096_norm_78(sp_digit* a) */ static void sp_4096_mont_shift_78(sp_digit* r, const sp_digit* a) { -#ifdef WOLFSSL_SP_SMALL int i; - int128_t n = a[77] >> 15; - n += ((int128_t)a[78]) << 38; - - for (i = 0; i < 77; i++) { - r[i] = n & 0x1fffffffffffffL; - n >>= 53; - n += ((int128_t)a[79 + i]) << 38; - } - r[77] = (sp_digit)n; -#else - int i; - int128_t n = a[77] >> 15; - n += ((int128_t)a[78]) << 38; + sp_int128 n = a[77] >> 15; + n += ((sp_int128)a[78]) << 38; for (i = 0; i < 72; i += 8) { r[i + 0] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 79]) << 38; + n >>= 53; n += ((sp_int128)a[i + 79]) << 38; r[i + 1] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 80]) << 38; + n >>= 53; n += ((sp_int128)a[i + 80]) << 38; r[i + 2] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 81]) << 38; + n >>= 53; n += ((sp_int128)a[i + 81]) << 38; r[i + 3] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 82]) << 38; + n >>= 53; n += ((sp_int128)a[i + 82]) << 38; r[i + 4] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 83]) << 38; + n >>= 53; n += ((sp_int128)a[i + 83]) << 38; r[i + 5] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 84]) << 38; + n >>= 53; n += ((sp_int128)a[i + 84]) << 38; r[i + 6] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 85]) << 38; + n >>= 53; n += ((sp_int128)a[i + 85]) << 38; r[i + 7] = n & 0x1fffffffffffffL; - n >>= 53; n += ((int128_t)a[i + 86]) << 38; + n >>= 53; n += ((sp_int128)a[i + 86]) << 38; } - r[72] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[151]) << 38; - r[73] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[152]) << 38; - r[74] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[153]) << 38; - r[75] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[154]) << 38; - r[76] = n & 0x1fffffffffffffL; n >>= 53; n += ((int128_t)a[155]) << 38; + r[72] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[151]) << 38; + r[73] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[152]) << 38; + r[74] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[153]) << 38; + r[75] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[154]) << 38; + r[76] = n & 0x1fffffffffffffL; n >>= 53; n += ((sp_int128)a[155]) << 38; r[77] = (sp_digit)n; -#endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[78], 0, sizeof(*r) * 78U); } @@ -10737,7 +19603,7 @@ static void sp_4096_mont_reduce_78(sp_digit* a, const sp_digit* m, sp_digit mp) a[i] &= 0x1fffffffffffffL; #endif 
sp_4096_mont_shift_78(a, a); - sp_4096_cond_sub_78(a, a, m, 0 - (((a[77] >> 15) > 0) ? + sp_4096_cond_sub_78(a, a, m, 0 - (((a[77] - m[77]) > 0) ? (sp_digit)1 : (sp_digit)0)); sp_4096_norm_78(a); } @@ -10781,22 +19647,10 @@ static void sp_4096_mont_sqr_78(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a, sp_digit b) { -#ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; - int i; - - for (i = 0; i < 156; i++) { - t += tb * a[i]; - r[i] = (sp_digit)(t & 0x1fffffffffffffL); - t >>= 53; - } - r[156] = (sp_digit)t; -#else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 156; i += 4) { @@ -10822,7 +19676,6 @@ SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a, r[i + 3] = (sp_digit)t2; } r[156] = (sp_digit)(t & 0x1fffffffffffffL); -#endif /* WOLFSSL_SP_SMALL */ } /* Conditionally add a and b using the mask m. @@ -10836,13 +19689,6 @@ SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a, static void sp_4096_cond_add_78(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit m) { -#ifdef WOLFSSL_SP_SMALL - int i; - - for (i = 0; i < 78; i++) { - r[i] = a[i] + (b[i] & m); - } -#else int i; for (i = 0; i < 72; i += 8) { @@ -10861,7 +19707,6 @@ static void sp_4096_cond_add_78(sp_digit* r, const sp_digit* a, r[75] = a[75] + (b[75] & m); r[76] = a[76] + (b[76] & m); r[77] = a[77] + (b[77] & m); -#endif /* WOLFSSL_SP_SMALL */ } SP_NOINLINE static void sp_4096_rshift_78(sp_digit* r, const sp_digit* a, @@ -10869,11 +19714,6 @@ SP_NOINLINE static void sp_4096_rshift_78(sp_digit* r, const sp_digit* a, { int i; -#ifdef WOLFSSL_SP_SMALL - for (i=0; i<77; i++) { - r[i] = ((a[i] >> n) | (a[i + 1] << (53 - n))) & 0x1fffffffffffffL; - } -#else for (i=0; i<72; i += 8) { r[i+0] = (a[i+0] >> n) | ((a[i+1] << (53 - n)) & 0x1fffffffffffffL); r[i+1] = (a[i+1] >> n) | ((a[i+2] << (53 - n)) & 0x1fffffffffffffL); @@ -10889,7 +19729,6 @@ SP_NOINLINE static void sp_4096_rshift_78(sp_digit* r, const sp_digit* a, r[74] = (a[74] >> n) | ((a[75] << (53 - n)) & 0x1fffffffffffffL); r[75] = (a[75] >> n) | ((a[76] << (53 - n)) & 0x1fffffffffffffL); r[76] = (a[76] >> n) | ((a[77] << (53 - n)) & 0x1fffffffffffffL); -#endif r[77] = a[77] >> n; } @@ -10966,7 +19805,7 @@ static int sp_4096_div_78(const sp_digit* a, const sp_digit* d, { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; @@ -11015,7 +19854,14 @@ static int sp_4096_div_78(const sp_digit* a, const sp_digit* d, t1[78 + i] -= t2[78]; t1[78 + i] += t1[78 + i - 1] >> 53; t1[78 + i - 1] &= 0x1fffffffffffffL; - r1 = (((-t1[78 + i]) << 53) - t1[78 + i - 1]) / dv; +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[78 + i]; + d1 <<= 53; + d1 -= t1[78 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_4096_div_word_78(-t1[78 + i], -t1[78 + i - 1], dv); +#endif r1 -= t1[78 + i]; sp_4096_mul_d_78(t2, sd, r1); (void)sp_4096_add_78(&t1[i], &t1[i], t2); @@ -11249,9 +20095,9 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(32 * 156) + 156]; + sp_digit td[(16 * 156) + 156]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm = NULL; sp_digit mp = 1; @@ -11262,7 +20108,7 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if 
defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 156) + 156), NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 156) + 156), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) err = MEMORY_E; @@ -11270,9 +20116,9 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 156; - rt = td + 4992; + rt = td + 2496; sp_4096_mont_setup(m, &mp); sp_4096_mont_norm_78(norm, m); @@ -11305,24 +20151,8 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_4096_mont_mul_78(t[13], t[ 7], t[ 6], m, mp); sp_4096_mont_sqr_78(t[14], t[ 7], m, mp); sp_4096_mont_mul_78(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_78(t[16], t[ 8], m, mp); - sp_4096_mont_mul_78(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_78(t[18], t[ 9], m, mp); - sp_4096_mont_mul_78(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_78(t[20], t[10], m, mp); - sp_4096_mont_mul_78(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_78(t[22], t[11], m, mp); - sp_4096_mont_mul_78(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_78(t[24], t[12], m, mp); - sp_4096_mont_mul_78(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_78(t[26], t[13], m, mp); - sp_4096_mont_mul_78(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_78(t[28], t[14], m, mp); - sp_4096_mont_mul_78(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_78(t[30], t[15], m, mp); - sp_4096_mont_mul_78(t[31], t[16], t[15], m, mp); - bits = ((bits + 4) / 5) * 5; + bits = ((bits + 3) / 4) * 4; i = ((bits + 52) / 53) - 1; c = bits % 53; if (c == 0) { @@ -11335,28 +20165,39 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, n = 0; i--; } - if (c < 5) { + if (c < 4) { n |= e[i--] << (11 - c); c += 53; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + y = (int)((n >> 60) & 0xf); + n <<= 4; + c -= 4; XMEMCPY(rt, t[y], sizeof(sp_digit) * 156); - while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (11 - c); - c += 53; + while ((i >= 0) || (c >= 4)) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; + } + else if (c == 0) { + n = e[i--] << 11; + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c = 49; + } + else { + y = (byte)((n >> 60) & 0xf); + n = e[i--] << 11; + c = 4 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 53 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_4096_mont_sqr_78(rt, rt, m, mp); sp_4096_mont_sqr_78(rt, rt, m, mp); sp_4096_mont_sqr_78(rt, rt, m, mp); sp_4096_mont_sqr_78(rt, rt, m, mp); - sp_4096_mont_sqr_78(rt, rt, m, mp); sp_4096_mont_mul_78(rt, rt, t[y], m, mp); } @@ -11486,7 +20327,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_4096_cond_sub_78(r, r, m, ((mp < 0) ? 
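sp_4096_mod_exp_78 now uses a 16-entry table of 4-bit windows instead of 32 entries of 5-bit windows, so the scratch area shrinks from (32 * 156 + 156) to (16 * 156 + 156) digits, roughly half, at the cost of about one extra multiplication per 20 exponent bits. A toy single-word version makes the table/window trade-off visible (sketch only: w must be 1..5 here, unsigned __int128 is a compiler extension, and the real code keeps its table in Montgomery form):

    #include <stdint.h>

    /* Fixed-window modular exponentiation on 64-bit words. t[k] holds
     * base^k mod m for k < 2^w; halving w halves the table. */
    static uint64_t modexp_window(uint64_t base, uint64_t exp, uint64_t m, int w)
    {
        uint64_t t[32];
        uint64_t r = 1 % m;
        int total = ((64 + w - 1) / w) * w;   /* pad leading zero bits */
        int bit;
        int k;

        t[0] = 1 % m;
        for (k = 1; k < (1 << w); k++)
            t[k] = (uint64_t)(((unsigned __int128)t[k - 1] * base) % m);

        for (bit = total - w; bit >= 0; bit -= w) {
            for (k = 0; k < w; k++)           /* w squarings per window */
                r = (uint64_t)(((unsigned __int128)r * r) % m);
            r = (uint64_t)(((unsigned __int128)r *
                            t[(exp >> bit) & ((1u << w) - 1u)]) % m);
        }
        return r;
    }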
(sp_digit)1 : (sp_digit)0)- 1); - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -11599,7 +20440,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -11693,7 +20534,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -11766,7 +20607,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -11789,10 +20630,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* a = NULL; #else - sp_digit a[39 * 11]; + sp_digit a[39 * 8]; #endif sp_digit* p = NULL; - sp_digit* q = NULL; sp_digit* dp = NULL; sp_digit* dq = NULL; sp_digit* qi = NULL; @@ -11821,31 +20661,31 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 39 * 11, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 39 * 8, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; } #endif if (err == MP_OKAY) { - p = a + 78 * 2; - q = p + 39; - qi = dq = dp = q + 39; + p = a + 78; + qi = dq = dp = p + 39; tmpa = qi + 39; tmpb = tmpa + 78; - r = a + 78; + r = a; sp_4096_from_bin(a, 78, in, inLen); sp_4096_from_mp(p, 39, pm); - sp_4096_from_mp(q, 39, qm); sp_4096_from_mp(dp, 39, dpm); err = sp_4096_mod_exp_39(tmpa, a, dp, 2048, p, 1); } if (err == MP_OKAY) { + sp_4096_from_mp(p, 39, qm); sp_4096_from_mp(dq, 39, dqm); - err = sp_4096_mod_exp_39(tmpb, a, dq, 2048, q, 1); + err = sp_4096_mod_exp_39(tmpb, a, dq, 2048, p, 1); } if (err == MP_OKAY) { + sp_4096_from_mp(p, 39, pm); (void)sp_4096_sub_39(tmpa, tmpa, tmpb); sp_4096_norm_39(tmpa); sp_4096_cond_add_39(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[38] >> 63)); @@ -11857,11 +20697,12 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_mul_39(tmpa, q, tmpa); + sp_4096_from_mp(p, 39, qm); + sp_4096_mul_39(tmpa, p, tmpa); (void)sp_4096_add_78(r, tmpb, tmpa); sp_4096_norm_78(r); - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -11869,7 +20710,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, if (a != NULL) #endif { - ForceZero(a, sizeof(sp_digit) * 39 * 11); + ForceZero(a, sizeof(sp_digit) * 39 * 8); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(a, NULL, DYNAMIC_TYPE_RSA); #endif @@ -11912,7 +20753,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 39 * 13, NULL, + a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 39 * 13, NULL, DYNAMIC_TYPE_RSA); if (a == NULL) err = MEMORY_E; @@ -11956,7 +20797,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, (void)sp_4096_add_78(r, tmpb, tmpa); sp_4096_norm_78(r); - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; } @@ -12193,14 +21034,6 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, SP_NOINLINE static void sp_4096_lshift_78(sp_digit* r, const sp_digit* a, byte n) { -#ifdef 
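The CRT private-key path now gets by with 39 * 8 digits of scratch by aliasing buffers: dp, dq and the CRT coefficient share one 39-digit area, and the p area is reloaded from pm or qm as each half is needed instead of holding q separately. The arithmetic it performs is the usual Garner recombination m = m2 + q * (qInv * (m1 - m2) mod p). A toy, single-word walk-through with textbook RSA numbers (p = 61, q = 53, e = 17, d = 413; values illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    /* Square-and-multiply; the values here are tiny, so plain 64-bit
     * arithmetic is enough. */
    static uint64_t modexp(uint64_t b, uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        b %= m;
        while (e != 0) {
            if (e & 1)
                r = (r * b) % m;
            b = (b * b) % m;
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t p = 61, q = 53, e = 17;
        uint64_t dP = 413 % (p - 1), dQ = 413 % (q - 1);
        uint64_t qInv = 38;                    /* 53^-1 mod 61 */
        uint64_t c, m1, m2, h;

        c = modexp(42, e, p * q);              /* "encrypt" the message 42 */
        m1 = modexp(c, dP, p);                 /* exponentiation mod p */
        m2 = modexp(c, dQ, q);                 /* exponentiation mod q */
        h = ((m1 + p - (m2 % p)) % p) * qInv % p;
        printf("%llu\n", (unsigned long long)(m2 + h * q)); /* prints 42 */
        return 0;
    }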
WOLFSSL_SP_SMALL - int i; - - r[78] = a[77] >> (53 - n); - for (i=77; i>0; i--) { - r[i] = ((a[i] << n) | (a[i-1] >> (53 - n))) & 0x1fffffffffffffL; - } -#else sp_int_digit s; sp_int_digit t; @@ -12360,7 +21193,6 @@ SP_NOINLINE static void sp_4096_lshift_78(sp_digit* r, const sp_digit* a, r[2] = ((s << n) | (t >> (53U - n))) & 0x1fffffffffffffUL; s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); r[1] = ((s << n) | (t >> (53U - n))) & 0x1fffffffffffffUL; -#endif r[0] = (a[0] << n) & 0x1fffffffffffffL; } @@ -12426,13 +21258,25 @@ static int sp_4096_mod_exp_2_78(sp_digit* r, const sp_digit* e, int bits, const c -= 5; sp_4096_lshift_78(r, norm, (byte)y); while ((i >= 0) || (c >= 5)) { - if (c < 5) { - n |= e[i--] << (11 - c); - c += 53; + if (c >= 5) { + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c -= 5; + } + else if (c == 0) { + n = e[i--] << 11; + y = (byte)((n >> 59) & 0x1f); + n <<= 5; + c = 48; + } + else { + y = (byte)((n >> 59) & 0x1f); + n = e[i--] << 11; + c = 5 - c; + y |= (byte)((n >> (64 - c)) & ((1 << c) - 1)); + n <<= c; + c = 53 - c; } - y = (int)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; sp_4096_mont_sqr_78(r, r, m, mp); sp_4096_mont_sqr_78(r, r, m, mp); @@ -12482,80 +21326,6 @@ static int sp_4096_mod_exp_2_78(sp_digit* r, const sp_digit* e, int bits, const int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, const mp_int* mod, byte* out, word32* outLen) { -#ifdef WOLFSSL_SP_SMALL - int err = MP_OKAY; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* b = NULL; -#else - sp_digit b[78 * 4]; -#endif - sp_digit* e = NULL; - sp_digit* m = NULL; - sp_digit* r = NULL; - word32 i; - - if (mp_count_bits(base) > 4096) { - err = MP_READ_E; - } - else if (expLen > 512) { - err = MP_READ_E; - } - else if (mp_count_bits(mod) != 4096) { - err = MP_READ_E; - } - else if (mp_iseven(mod)) { - err = MP_VAL; - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (err == MP_OKAY) { - b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 78 * 4, NULL, DYNAMIC_TYPE_DH); - if (b == NULL) - err = MEMORY_E; - } -#endif - - if (err == MP_OKAY) { - e = b + 78 * 2; - m = e + 78; - r = b; - - sp_4096_from_mp(b, 78, base); - sp_4096_from_bin(e, 78, exp, expLen); - sp_4096_from_mp(m, 78, mod); - - #ifdef HAVE_FFDHE_4096 - if (base->used == 1 && base->dp[0] == 2 && - ((m[77] << 17) | (m[76] >> 36)) == 0xffffffffL) { - err = sp_4096_mod_exp_2_78(r, e, expLen * 8, m); - } - else - #endif - err = sp_4096_mod_exp_78(r, b, e, expLen * 8, m, 0); - } - - if (err == MP_OKAY) { - sp_4096_to_bin(r, out); - *outLen = 512; - for (i=0; i<512 && out[i] == 0; i++) { - } - *outLen -= i; - XMEMMOVE(out, out + i, *outLen); - } - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (b != NULL) -#endif - { - /* only "e" is sensitive and needs zeroized */ - if (e != NULL) - ForceZero(e, sizeof(sp_digit) * 78U); - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - XFREE(b, NULL, DYNAMIC_TYPE_DH); - #endif - } - return err; -#else #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* b = NULL; #else @@ -12612,7 +21382,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_78(r, out); *outLen = 512; for (i=0; i<512U && out[i] == 0U; i++) { /* Search for first non-zero. 
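sp_4096_lshift_78, used by sp_4096_mod_exp_2_78 where multiplying by a small power of two is just a shift followed by a reduction, now keeps only the unrolled body. What it computes, in loop form (sketch; 53-bit digits, 0 < s < 53, name mine):

    #include <stdint.h>

    /* Shift an n-digit number left by s bits (0 < s < 53); r gets
     * n + 1 digits. Loop form of the unrolled sp_4096_lshift_78. */
    static void lshift_digits(uint64_t *r, const uint64_t *a, int n, unsigned s)
    {
        int i;

        r[n] = a[n - 1] >> (53 - s);
        for (i = n - 1; i > 0; i--)
            r[i] = ((a[i] << s) | (a[i - 1] >> (53 - s))) & 0x1fffffffffffffULL;
        r[0] = (a[0] << s) & 0x1fffffffffffffULL;
    }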
*/ @@ -12634,12 +21404,12 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } return err; -#endif } #endif /* WOLFSSL_HAVE_SP_DH */ #endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */ +#endif /* WOLFSSL_SP_SMALL */ #endif /* WOLFSSL_SP_4096 */ #endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ @@ -12735,30 +21505,33 @@ SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 lo; - c = ((int128_t)a[4]) * b[4]; + c = ((sp_uint128)a[4]) * b[4]; r[9] = (sp_digit)(c >> 52); - c = (c & 0xfffffffffffffL) << 52; + c &= 0xfffffffffffffL; for (k = 7; k >= 0; k--) { - for (i = 4; i >= 0; i--) { - j = k - i; - if (j >= 5) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; + if (k >= 5) { + i = k - 4; + imax = 4; } - r[k + 2] += (sp_digit)(c >> 104); - r[k + 1] = (sp_digit)((c >> 52) & 0xfffffffffffffL); - c = (c & 0xfffffffffffffL) << 52; + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 52; + r[k + 2] += (sp_digit)(c >> 52); + r[k + 1] = (sp_digit)(c & 0xfffffffffffffL); + c = lo & 0xfffffffffffffL; } - r[0] = (sp_digit)(c >> 52); + r[0] = (sp_digit)c; } #else @@ -12771,31 +21544,31 @@ SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1]; - int128_t t6 = ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2]; - int128_t t7 = ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3]; - int128_t t8 = ((int128_t)a[ 4]) * b[ 4]; + sp_int128 t0 = ((sp_int128)a[ 0]) * b[ 0]; + sp_int128 t1 = ((sp_int128)a[ 0]) * b[ 1] + + ((sp_int128)a[ 1]) * b[ 0]; + sp_int128 t2 = ((sp_int128)a[ 0]) * b[ 2] + + ((sp_int128)a[ 1]) * b[ 1] + + ((sp_int128)a[ 2]) * b[ 0]; + sp_int128 t3 = ((sp_int128)a[ 0]) * b[ 3] + + ((sp_int128)a[ 1]) * b[ 2] + + ((sp_int128)a[ 2]) * b[ 1] + + ((sp_int128)a[ 3]) * b[ 0]; + sp_int128 t4 = ((sp_int128)a[ 0]) * b[ 4] + + ((sp_int128)a[ 1]) * b[ 3] + + ((sp_int128)a[ 2]) * b[ 2] + + ((sp_int128)a[ 3]) * b[ 1] + + ((sp_int128)a[ 4]) * b[ 0]; + sp_int128 t5 = ((sp_int128)a[ 1]) * b[ 4] + + ((sp_int128)a[ 2]) * b[ 3] + + ((sp_int128)a[ 3]) * b[ 2] + + ((sp_int128)a[ 4]) * b[ 1]; + sp_int128 t6 = ((sp_int128)a[ 2]) * b[ 4] + + ((sp_int128)a[ 3]) * b[ 3] + + ((sp_int128)a[ 4]) * b[ 2]; + sp_int128 t7 = ((sp_int128)a[ 3]) * b[ 4] + + ((sp_int128)a[ 4]) * b[ 3]; + sp_int128 t8 = ((sp_int128)a[ 4]) * b[ 4]; t1 += t0 >> 52; r[ 0] = t0 & 0xfffffffffffffL; t2 += t1 >> 52; r[ 1] = t1 & 0xfffffffffffffL; @@ -12819,31 +21592,34 @@ SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_256_sqr_5(sp_digit* r, const sp_digit* a) { 
int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 t; - c = ((int128_t)a[4]) * a[4]; + c = ((sp_uint128)a[4]) * a[4]; r[9] = (sp_digit)(c >> 52); c = (c & 0xfffffffffffffL) << 52; for (k = 7; k >= 0; k--) { - for (i = 4; i >= 0; i--) { - j = k - i; - if (j >= 5 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; + if (k < 4) { + imax = k; } + else { + imax = 4; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; - r[k + 2] += (sp_digit)(c >> 104); - r[k + 1] = (sp_digit)((c >> 52) & 0xfffffffffffffL); + r[k + 2] += (sp_digit) (c >> 104); + r[k + 1] = (sp_digit)((c >> 52) & 0xfffffffffffffL); c = (c & 0xfffffffffffffL) << 52; } r[0] = (sp_digit)(c >> 52); @@ -12857,21 +21633,21 @@ SP_NOINLINE static void sp_256_sqr_5(sp_digit* r, const sp_digit* a) */ SP_NOINLINE static void sp_256_sqr_5(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = ((int128_t)a[ 4]) * a[ 4]; + sp_int128 t0 = ((sp_int128)a[ 0]) * a[ 0]; + sp_int128 t1 = (((sp_int128)a[ 0]) * a[ 1]) * 2; + sp_int128 t2 = (((sp_int128)a[ 0]) * a[ 2]) * 2 + + ((sp_int128)a[ 1]) * a[ 1]; + sp_int128 t3 = (((sp_int128)a[ 0]) * a[ 3] + + ((sp_int128)a[ 1]) * a[ 2]) * 2; + sp_int128 t4 = (((sp_int128)a[ 0]) * a[ 4] + + ((sp_int128)a[ 1]) * a[ 3]) * 2 + + ((sp_int128)a[ 2]) * a[ 2]; + sp_int128 t5 = (((sp_int128)a[ 1]) * a[ 4] + + ((sp_int128)a[ 2]) * a[ 3]) * 2; + sp_int128 t6 = (((sp_int128)a[ 2]) * a[ 4]) * 2 + + ((sp_int128)a[ 3]) * a[ 3]; + sp_int128 t7 = (((sp_int128)a[ 3]) * a[ 4]) * 2; + sp_int128 t8 = ((sp_int128)a[ 4]) * a[ 4]; t1 += t0 >> 52; r[ 0] = t0 & 0xfffffffffffffL; t2 += t1 >> 52; r[ 1] = t1 & 0xfffffffffffffL; @@ -13220,19 +21996,31 @@ SP_NOINLINE static void sp_256_mul_add_5(sp_digit* r, const sp_digit* a, const sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t[4]; int i; - for (i = 0; i < 5; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0xfffffffffffffL; - t >>= 52; + t[0] = 0; + for (i = 0; i < 4; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0xfffffffffffffL; + t[1] += t[0] >> 52; + r[i+1] = t[1] & 0xfffffffffffffL; + t[2] += t[1] >> 52; + r[i+2] = t[2] & 0xfffffffffffffL; + t[3] += t[2] >> 52; + r[i+3] = t[3] & 0xfffffffffffffL; + t[0] = t[3] >> 52; } - r[5] += (sp_digit)t; + t[0] += (tb * a[4]) + r[4]; + r[4] = t[0] & 0xfffffffffffffL; + r[5] += (sp_digit)(t[0] >> 52); #else - int128_t tb = b; - int128_t t[5]; + sp_int128 tb = b; + sp_int128 t[5]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -13248,7 +22036,7 @@ SP_NOINLINE static void sp_256_mul_add_5(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -/* Normalize the values in each word 
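The reworked sp_256_sqr_5 (small variant) forms each column by summing every cross product a[i]*a[k-i] with i > k-i exactly once, doubling that partial sum, and adding the middle square a[k/2]^2 when k is even, which roughly halves the multiplication count versus the general product. A generic loop form of that column rule (52-bit digits as in the P-256 code; unsigned __int128 stands in for sp_uint128; the name is mine):

    #include <stdint.h>

    /* r[0..2n-1] = a[0..n-1]^2 with 52-bit digits. Sketch only. */
    static void column_sqr(uint64_t *r, const uint64_t *a, int n)
    {
        unsigned __int128 c = 0;              /* column total + carry */
        int k;

        for (k = 0; k < 2 * n - 1; k++) {
            unsigned __int128 t = 0;          /* cross products, once each */
            int i = k / 2 + 1;
            int imax = (k < n) ? k : (n - 1);

            if ((k & 1) == 0)                 /* middle square, not doubled */
                c += (unsigned __int128)a[k / 2] * a[k / 2];
            for (; i <= imax; i++)
                t += (unsigned __int128)a[i] * a[k - i];
            c += t * 2;
            r[k] = (uint64_t)(c & 0xfffffffffffffULL);
            c >>= 52;
        }
        r[2 * n - 1] = (uint64_t)c;
    }

Collecting the undoubled cross products in their own accumulator t, and doubling only once per column, is the same shape the rewritten sp_256_sqr_5 uses to keep its intermediate values bounded.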
to 52. +/* Normalize the values in each word to 52 bits. * * a Array of sp_digit to normalize. */ @@ -13265,7 +22053,7 @@ static void sp_256_norm_5(sp_digit* a) a[2] += a[1] >> 52; a[1] &= 0xfffffffffffffL; a[3] += a[2] >> 52; a[2] &= 0xfffffffffffffL; a[4] += a[3] >> 52; a[3] &= 0xfffffffffffffL; -#endif +#endif /* WOLFSSL_SP_SMALL */ } /* Shift the result in the high 256 bits down to the bottom. @@ -13277,25 +22065,25 @@ static void sp_256_mont_shift_5(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - word64 n; + sp_uint64 n; n = a[4] >> 48; for (i = 0; i < 4; i++) { - n += (word64)a[5 + i] << 4; + n += (sp_uint64)a[5 + i] << 4; r[i] = n & 0xfffffffffffffL; n >>= 52; } - n += (word64)a[9] << 4; + n += (sp_uint64)a[9] << 4; r[4] = n; #else - word64 n; + sp_uint64 n; n = a[4] >> 48; - n += (word64)a[ 5] << 4U; r[ 0] = n & 0xfffffffffffffUL; n >>= 52U; - n += (word64)a[ 6] << 4U; r[ 1] = n & 0xfffffffffffffUL; n >>= 52U; - n += (word64)a[ 7] << 4U; r[ 2] = n & 0xfffffffffffffUL; n >>= 52U; - n += (word64)a[ 8] << 4U; r[ 3] = n & 0xfffffffffffffUL; n >>= 52U; - n += (word64)a[ 9] << 4U; r[ 4] = n; + n += (sp_uint64)a[ 5] << 4U; r[ 0] = n & 0xfffffffffffffUL; n >>= 52U; + n += (sp_uint64)a[ 6] << 4U; r[ 1] = n & 0xfffffffffffffUL; n >>= 52U; + n += (sp_uint64)a[ 7] << 4U; r[ 2] = n & 0xfffffffffffffUL; n >>= 52U; + n += (sp_uint64)a[ 8] << 4U; r[ 3] = n & 0xfffffffffffffUL; n >>= 52U; + n += (sp_uint64)a[ 9] << 4U; r[ 4] = n; #endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[5], 0, sizeof(*r) * 5U); } @@ -13477,7 +22265,7 @@ static void sp_256_map_5(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*5; - int64_t n; + sp_int64 n; sp_256_mont_inv_5(t1, p->z, t + 2*5); @@ -17303,7 +26091,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_5(sp_digit* r, byte* a) { int i; int j; @@ -17390,7 +26178,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_5(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_5(point->x, out); *outLen = 32; } @@ -17408,6 +26196,34 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_256_rshift_5(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<4; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (52 - n))) & 0xfffffffffffffL; + } +#else + for (i=0; i<0; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (52 - n)) & 0xfffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (52 - n)) & 0xfffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (52 - n)) & 0xfffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (52 - n)) & 0xfffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (52 - n)) & 0xfffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (52 - n)) & 0xfffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (52 - n)) & 0xfffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (52 - n)) & 0xfffffffffffffL); + } + r[0] = (a[0] >> n) | ((a[1] << (52 - n)) & 0xfffffffffffffL); + r[1] = (a[1] >> n) | ((a[2] << (52 - n)) & 0xfffffffffffffL); + r[2] = (a[2] >> n) | ((a[3] << (52 - n)) & 0xfffffffffffffL); + r[3] = (a[3] >> n) | ((a[4] << (52 - n)) & 0xfffffffffffffL); +#endif /* WOLFSSL_SP_SMALL */ + r[4] = a[4] >> n; +} + /* Multiply a by scalar b into r. (r = a * b) * * r A single precision integer. @@ -17418,8 +26234,8 @@ SP_NOINLINE static void sp_256_mul_d_5(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; int i; for (i = 0; i < 5; i++) { @@ -17429,8 +26245,8 @@ SP_NOINLINE static void sp_256_mul_d_5(sp_digit* r, const sp_digit* a, } r[5] = (sp_digit)t; #else - int128_t tb = b; - int128_t t[5]; + sp_int128 tb = b; + sp_int128 t[5]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -17446,59 +26262,48 @@ SP_NOINLINE static void sp_256_mul_d_5(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -#ifdef WOLFSSL_SP_DIV_64 -static WC_INLINE sp_digit sp_256_div_word_5(sp_digit d1, sp_digit d0, - sp_digit dv) +SP_NOINLINE static void sp_256_lshift_10(sp_digit* r, const sp_digit* a, + byte n) { - sp_digit d; - sp_digit r; - sp_digit t; +#ifdef WOLFSSL_SP_SMALL + int i; - /* All 52 bits from d1 and top 11 bits from d0. */ - d = (d1 << 11) + (d0 >> 41); - r = d / dv; - d -= r * dv; - /* Up to 12 bits in r */ - /* Next 11 bits from d0. */ - r <<= 11; - d <<= 11; - d += (d0 >> 30) & ((1 << 11) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 23 bits in r */ - /* Next 11 bits from d0. */ - r <<= 11; - d <<= 11; - d += (d0 >> 19) & ((1 << 11) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 34 bits in r */ - /* Next 11 bits from d0. */ - r <<= 11; - d <<= 11; - d += (d0 >> 8) & ((1 << 11) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 45 bits in r */ - /* Remaining 8 bits from d0. 
*/ - r <<= 8; - d <<= 8; - d += d0 & ((1 << 8) - 1); - t = d / dv; - r += t; + r[10] = a[9] >> (52 - n); + for (i=9; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (52 - n))) & 0xfffffffffffffL; + } +#else + sp_int_digit s; + sp_int_digit t; - /* All 52 bits from d1 and top 11 bits from d0. */ - return r; + s = (sp_int_digit)a[9]; + r[10] = s >> (52U - n); + s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (52U - n))) & 0xfffffffffffffUL; +#endif /* WOLFSSL_SP_SMALL */ + r[0] = (a[0] << n) & 0xfffffffffffffL; } -#endif /* WOLFSSL_SP_DIV_64 */ /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Simplified based on top word of divisor being very large. * * a Number to be divided. * d Number to divide with. @@ -17506,75 +26311,60 @@ static WC_INLINE sp_digit sp_256_div_word_5(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
*/ -static int sp_256_div_5(const sp_digit* a, const sp_digit* d, +static int sp_256_div_5(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; -#ifndef WOLFSSL_SP_DIV_64 - int128_t d1; -#endif - sp_digit dv; sp_digit r1; + sp_digit mask; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 5 + 1]; + sp_digit t1[4 * 5 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 5 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 5 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif + (void)m; + if (err == MP_OKAY) { - t2 = t1 + 2 * 5; + t2 = t1 + 10 + 1; + sd = t2 + 5 + 1; - dv = d[4]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 5U); + sp_256_mul_d_5(sd, d, (sp_digit)1 << 4); + sp_256_lshift_10(t1, a, 4); + t1[5 + 5] += t1[5 + 5 - 1] >> 52; + t1[5 + 5 - 1] &= 0xfffffffffffffL; for (i=4; i>=0; i--) { - t1[5 + i] += t1[5 + i - 1] >> 52; - t1[5 + i - 1] &= 0xfffffffffffffL; -#ifndef WOLFSSL_SP_DIV_64 - d1 = t1[5 + i]; - d1 <<= 52; - d1 += t1[5 + i - 1]; - r1 = (sp_digit)(d1 / dv); -#else - r1 = sp_256_div_word_5(t1[5 + i], t1[5 + i - 1], dv); -#endif - - sp_256_mul_d_5(t2, d, r1); + r1 = t1[5 + i]; + sp_256_mul_d_5(t2, sd, r1); (void)sp_256_sub_5(&t1[i], &t1[i], t2); - sp_256_norm_5(&t1[i]); t1[5 + i] -= t2[5]; - t1[5 + i] += t1[5 + i - 1] >> 52; - t1[5 + i - 1] &= 0xfffffffffffffL; - r1 = (((-t1[5 + i]) << 52) - t1[5 + i - 1]) / dv; - r1++; - sp_256_mul_d_5(t2, d, r1); - (void)sp_256_add_5(&t1[i], &t1[i], t2); - t1[5 + i] += t1[5 + i - 1] >> 52; - t1[5 + i - 1] &= 0xfffffffffffffL; - } - t1[5 - 1] += t1[5 - 2] >> 52; - t1[5 - 2] &= 0xfffffffffffffL; - r1 = t1[5 - 1] / dv; + sp_256_norm_5(&t1[i + 1]); - sp_256_mul_d_5(t2, d, r1); - (void)sp_256_sub_5(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 10U); - for (i=0; i<4; i++) { - r[i+1] += r[i] >> 52; - r[i] &= 0xfffffffffffffL; + r1 = t1[5 + i]; + sp_256_mul_d_5(t2, sd, r1); + (void)sp_256_sub_5(&t1[i], &t1[i], t2); + t1[5 + i] -= t2[5]; + sp_256_norm_5(&t1[i + 1]); + + mask = (sp_digit)0 - ((t1[5 + i] > 0) ? + (sp_digit)1 : (sp_digit)0); + sp_256_cond_sub_5(t1 + i, t1 + i, sd, mask); + sp_256_norm_5(&t1[i + 1]); } - sp_256_cond_add_5(r, r, d, 0 - ((r[4] < 0) ? 
- (sp_digit)1 : (sp_digit)0)); + sp_256_norm_5(t1); + sp_256_rshift_5(r, t1, 4); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -17817,7 +26607,7 @@ static int sp_256_calc_s_5(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -17929,7 +26719,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 5U); sp_256_norm_5(ctx->r); @@ -17978,7 +26768,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_256_norm_5(ctx->x); carry = sp_256_add_5(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_5(ctx->s, ctx->s, @@ -18048,7 +26838,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; @@ -18527,7 +27317,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_5(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -18582,7 +27372,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -19314,30 +28104,33 @@ SP_NOINLINE static void sp_384_mul_7(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 lo; - c = ((int128_t)a[6]) * b[6]; + c = ((sp_uint128)a[6]) * b[6]; r[13] = (sp_digit)(c >> 55); - c = (c & 0x7fffffffffffffL) << 55; + c &= 0x7fffffffffffffL; for (k = 11; k >= 0; k--) { - for (i = 6; i >= 0; i--) { - j = k - i; - if (j >= 7) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; + if (k >= 7) { + i = k - 6; + imax = 6; } - r[k + 2] += (sp_digit)(c >> 110); - r[k + 1] = (sp_digit)((c >> 55) & 0x7fffffffffffffL); - c = (c & 0x7fffffffffffffL) << 55; + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 55; + r[k + 2] += (sp_digit)(c >> 55); + r[k + 1] = (sp_digit)(c & 0x7fffffffffffffL); + c = lo & 0x7fffffffffffffL; } - r[0] = (sp_digit)(c >> 55); + r[0] = (sp_digit)c; } #else @@ -19350,55 +28143,55 @@ SP_NOINLINE static void sp_384_mul_7(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_384_mul_7(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 0]) * b[ 5] - + ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1] - + ((int128_t)a[ 5]) * b[ 0]; - 
int128_t t6 = ((int128_t)a[ 0]) * b[ 6] - + ((int128_t)a[ 1]) * b[ 5] - + ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2] - + ((int128_t)a[ 5]) * b[ 1] - + ((int128_t)a[ 6]) * b[ 0]; - int128_t t7 = ((int128_t)a[ 1]) * b[ 6] - + ((int128_t)a[ 2]) * b[ 5] - + ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3] - + ((int128_t)a[ 5]) * b[ 2] - + ((int128_t)a[ 6]) * b[ 1]; - int128_t t8 = ((int128_t)a[ 2]) * b[ 6] - + ((int128_t)a[ 3]) * b[ 5] - + ((int128_t)a[ 4]) * b[ 4] - + ((int128_t)a[ 5]) * b[ 3] - + ((int128_t)a[ 6]) * b[ 2]; - int128_t t9 = ((int128_t)a[ 3]) * b[ 6] - + ((int128_t)a[ 4]) * b[ 5] - + ((int128_t)a[ 5]) * b[ 4] - + ((int128_t)a[ 6]) * b[ 3]; - int128_t t10 = ((int128_t)a[ 4]) * b[ 6] - + ((int128_t)a[ 5]) * b[ 5] - + ((int128_t)a[ 6]) * b[ 4]; - int128_t t11 = ((int128_t)a[ 5]) * b[ 6] - + ((int128_t)a[ 6]) * b[ 5]; - int128_t t12 = ((int128_t)a[ 6]) * b[ 6]; + sp_int128 t0 = ((sp_int128)a[ 0]) * b[ 0]; + sp_int128 t1 = ((sp_int128)a[ 0]) * b[ 1] + + ((sp_int128)a[ 1]) * b[ 0]; + sp_int128 t2 = ((sp_int128)a[ 0]) * b[ 2] + + ((sp_int128)a[ 1]) * b[ 1] + + ((sp_int128)a[ 2]) * b[ 0]; + sp_int128 t3 = ((sp_int128)a[ 0]) * b[ 3] + + ((sp_int128)a[ 1]) * b[ 2] + + ((sp_int128)a[ 2]) * b[ 1] + + ((sp_int128)a[ 3]) * b[ 0]; + sp_int128 t4 = ((sp_int128)a[ 0]) * b[ 4] + + ((sp_int128)a[ 1]) * b[ 3] + + ((sp_int128)a[ 2]) * b[ 2] + + ((sp_int128)a[ 3]) * b[ 1] + + ((sp_int128)a[ 4]) * b[ 0]; + sp_int128 t5 = ((sp_int128)a[ 0]) * b[ 5] + + ((sp_int128)a[ 1]) * b[ 4] + + ((sp_int128)a[ 2]) * b[ 3] + + ((sp_int128)a[ 3]) * b[ 2] + + ((sp_int128)a[ 4]) * b[ 1] + + ((sp_int128)a[ 5]) * b[ 0]; + sp_int128 t6 = ((sp_int128)a[ 0]) * b[ 6] + + ((sp_int128)a[ 1]) * b[ 5] + + ((sp_int128)a[ 2]) * b[ 4] + + ((sp_int128)a[ 3]) * b[ 3] + + ((sp_int128)a[ 4]) * b[ 2] + + ((sp_int128)a[ 5]) * b[ 1] + + ((sp_int128)a[ 6]) * b[ 0]; + sp_int128 t7 = ((sp_int128)a[ 1]) * b[ 6] + + ((sp_int128)a[ 2]) * b[ 5] + + ((sp_int128)a[ 3]) * b[ 4] + + ((sp_int128)a[ 4]) * b[ 3] + + ((sp_int128)a[ 5]) * b[ 2] + + ((sp_int128)a[ 6]) * b[ 1]; + sp_int128 t8 = ((sp_int128)a[ 2]) * b[ 6] + + ((sp_int128)a[ 3]) * b[ 5] + + ((sp_int128)a[ 4]) * b[ 4] + + ((sp_int128)a[ 5]) * b[ 3] + + ((sp_int128)a[ 6]) * b[ 2]; + sp_int128 t9 = ((sp_int128)a[ 3]) * b[ 6] + + ((sp_int128)a[ 4]) * b[ 5] + + ((sp_int128)a[ 5]) * b[ 4] + + ((sp_int128)a[ 6]) * b[ 3]; + sp_int128 t10 = ((sp_int128)a[ 4]) * b[ 6] + + ((sp_int128)a[ 5]) * b[ 5] + + ((sp_int128)a[ 6]) * b[ 4]; + sp_int128 t11 = ((sp_int128)a[ 5]) * b[ 6] + + ((sp_int128)a[ 6]) * b[ 5]; + sp_int128 t12 = ((sp_int128)a[ 6]) * b[ 6]; t1 += t0 >> 55; r[ 0] = t0 & 0x7fffffffffffffL; t2 += t1 >> 55; r[ 1] = t1 & 0x7fffffffffffffL; @@ -19426,31 +28219,34 @@ SP_NOINLINE static void sp_384_mul_7(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_384_sqr_7(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 t; - c = ((int128_t)a[6]) * a[6]; + c = ((sp_uint128)a[6]) * a[6]; r[13] = (sp_digit)(c >> 55); c = (c & 0x7fffffffffffffL) << 55; for (k = 11; k >= 0; k--) { - for (i = 6; i >= 0; i--) { - j = k - i; - if (j >= 7 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; + if (k < 6) { + imax = k; } + else { + imax = 6; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c 
+= t * 2; - r[k + 2] += (sp_digit)(c >> 110); - r[k + 1] = (sp_digit)((c >> 55) & 0x7fffffffffffffL); + r[k + 2] += (sp_digit) (c >> 110); + r[k + 1] = (sp_digit)((c >> 55) & 0x7fffffffffffffL); c = (c & 0x7fffffffffffffL) << 55; } r[0] = (sp_digit)(c >> 55); @@ -19464,34 +28260,34 @@ SP_NOINLINE static void sp_384_sqr_7(sp_digit* r, const sp_digit* a) */ SP_NOINLINE static void sp_384_sqr_7(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 0]) * a[ 5] - + ((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 0]) * a[ 6] - + ((int128_t)a[ 1]) * a[ 5] - + ((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 1]) * a[ 6] - + ((int128_t)a[ 2]) * a[ 5] - + ((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = (((int128_t)a[ 2]) * a[ 6] - + ((int128_t)a[ 3]) * a[ 5]) * 2 - + ((int128_t)a[ 4]) * a[ 4]; - int128_t t9 = (((int128_t)a[ 3]) * a[ 6] - + ((int128_t)a[ 4]) * a[ 5]) * 2; - int128_t t10 = (((int128_t)a[ 4]) * a[ 6]) * 2 - + ((int128_t)a[ 5]) * a[ 5]; - int128_t t11 = (((int128_t)a[ 5]) * a[ 6]) * 2; - int128_t t12 = ((int128_t)a[ 6]) * a[ 6]; + sp_int128 t0 = ((sp_int128)a[ 0]) * a[ 0]; + sp_int128 t1 = (((sp_int128)a[ 0]) * a[ 1]) * 2; + sp_int128 t2 = (((sp_int128)a[ 0]) * a[ 2]) * 2 + + ((sp_int128)a[ 1]) * a[ 1]; + sp_int128 t3 = (((sp_int128)a[ 0]) * a[ 3] + + ((sp_int128)a[ 1]) * a[ 2]) * 2; + sp_int128 t4 = (((sp_int128)a[ 0]) * a[ 4] + + ((sp_int128)a[ 1]) * a[ 3]) * 2 + + ((sp_int128)a[ 2]) * a[ 2]; + sp_int128 t5 = (((sp_int128)a[ 0]) * a[ 5] + + ((sp_int128)a[ 1]) * a[ 4] + + ((sp_int128)a[ 2]) * a[ 3]) * 2; + sp_int128 t6 = (((sp_int128)a[ 0]) * a[ 6] + + ((sp_int128)a[ 1]) * a[ 5] + + ((sp_int128)a[ 2]) * a[ 4]) * 2 + + ((sp_int128)a[ 3]) * a[ 3]; + sp_int128 t7 = (((sp_int128)a[ 1]) * a[ 6] + + ((sp_int128)a[ 2]) * a[ 5] + + ((sp_int128)a[ 3]) * a[ 4]) * 2; + sp_int128 t8 = (((sp_int128)a[ 2]) * a[ 6] + + ((sp_int128)a[ 3]) * a[ 5]) * 2 + + ((sp_int128)a[ 4]) * a[ 4]; + sp_int128 t9 = (((sp_int128)a[ 3]) * a[ 6] + + ((sp_int128)a[ 4]) * a[ 5]) * 2; + sp_int128 t10 = (((sp_int128)a[ 4]) * a[ 6]) * 2 + + ((sp_int128)a[ 5]) * a[ 5]; + sp_int128 t11 = (((sp_int128)a[ 5]) * a[ 6]) * 2; + sp_int128 t12 = ((sp_int128)a[ 6]) * a[ 6]; t1 += t0 >> 55; r[ 0] = t0 & 0x7fffffffffffffL; t2 += t1 >> 55; r[ 1] = t1 & 0x7fffffffffffffL; @@ -19852,19 +28648,37 @@ SP_NOINLINE static void sp_384_mul_add_7(sp_digit* r, const sp_digit* a, const sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t[4]; int i; - for (i = 0; i < 7; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x7fffffffffffffL; - t >>= 55; + t[0] = 0; + for (i = 0; i < 4; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x7fffffffffffffL; + t[1] += t[0] >> 55; + r[i+1] = t[1] & 0x7fffffffffffffL; + t[2] += t[1] >> 55; + r[i+2] = t[2] & 0x7fffffffffffffL; + t[3] += t[2] >> 55; + r[i+3] = t[3] & 0x7fffffffffffffL; + t[0] = t[3] >> 55; } - r[7] += (sp_digit)t; + t[0] += (tb * a[4]) + r[4]; + t[1] = (tb * a[5]) + r[5]; + t[2] = (tb 
* a[6]) + r[6]; + r[4] = t[0] & 0x7fffffffffffffL; + t[1] += t[0] >> 55; + r[5] = t[1] & 0x7fffffffffffffL; + t[2] += t[1] >> 55; + r[6] = t[2] & 0x7fffffffffffffL; + r[7] += (sp_digit)(t[2] >> 55); #else - int128_t tb = b; - int128_t t[7]; + sp_int128 tb = b; + sp_int128 t[7]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -19884,7 +28698,7 @@ SP_NOINLINE static void sp_384_mul_add_7(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -/* Normalize the values in each word to 55. +/* Normalize the values in each word to 55 bits. * * a Array of sp_digit to normalize. */ @@ -19903,7 +28717,7 @@ static void sp_384_norm_7(sp_digit* a) a[4] += a[3] >> 55; a[3] &= 0x7fffffffffffffL; a[5] += a[4] >> 55; a[4] &= 0x7fffffffffffffL; a[6] += a[5] >> 55; a[5] &= 0x7fffffffffffffL; -#endif +#endif /* WOLFSSL_SP_SMALL */ } /* Shift the result in the high 384 bits down to the bottom. @@ -19915,27 +28729,27 @@ static void sp_384_mont_shift_7(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - word64 n; + sp_uint64 n; n = a[6] >> 54; for (i = 0; i < 6; i++) { - n += (word64)a[7 + i] << 1; + n += (sp_uint64)a[7 + i] << 1; r[i] = n & 0x7fffffffffffffL; n >>= 55; } - n += (word64)a[13] << 1; + n += (sp_uint64)a[13] << 1; r[6] = n; #else - word64 n; + sp_uint64 n; n = a[6] >> 54; - n += (word64)a[ 7] << 1U; r[ 0] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[ 8] << 1U; r[ 1] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[ 9] << 1U; r[ 2] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[10] << 1U; r[ 3] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[11] << 1U; r[ 4] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[12] << 1U; r[ 5] = n & 0x7fffffffffffffUL; n >>= 55U; - n += (word64)a[13] << 1U; r[ 6] = n; + n += (sp_uint64)a[ 7] << 1U; r[ 0] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[ 8] << 1U; r[ 1] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[ 9] << 1U; r[ 2] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[10] << 1U; r[ 3] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[11] << 1U; r[ 4] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[12] << 1U; r[ 5] = n & 0x7fffffffffffffUL; n >>= 55U; + n += (sp_uint64)a[13] << 1U; r[ 6] = n; #endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[7], 0, sizeof(*r) * 7U); } @@ -20121,7 +28935,7 @@ static void sp_384_map_7(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*7; - int64_t n; + sp_int64 n; sp_384_mont_inv_7(t1, p->z, t + 2*7); @@ -24518,7 +33332,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_7(sp_digit* r, byte* a) { int i; int j; @@ -24605,7 +33419,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_7(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_7(point->x, out); *outLen = 48; } @@ -24623,6 +33437,36 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_384_rshift_7(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<6; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (55 - n))) & 0x7fffffffffffffL; + } +#else + for (i=0; i<0; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (55 - n)) & 0x7fffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (55 - n)) & 0x7fffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (55 - n)) & 0x7fffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (55 - n)) & 0x7fffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (55 - n)) & 0x7fffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (55 - n)) & 0x7fffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (55 - n)) & 0x7fffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (55 - n)) & 0x7fffffffffffffL); + } + r[0] = (a[0] >> n) | ((a[1] << (55 - n)) & 0x7fffffffffffffL); + r[1] = (a[1] >> n) | ((a[2] << (55 - n)) & 0x7fffffffffffffL); + r[2] = (a[2] >> n) | ((a[3] << (55 - n)) & 0x7fffffffffffffL); + r[3] = (a[3] >> n) | ((a[4] << (55 - n)) & 0x7fffffffffffffL); + r[4] = (a[4] >> n) | ((a[5] << (55 - n)) & 0x7fffffffffffffL); + r[5] = (a[5] >> n) | ((a[6] << (55 - n)) & 0x7fffffffffffffL); +#endif /* WOLFSSL_SP_SMALL */ + r[6] = a[6] >> n; +} + /* Multiply a by scalar b into r. (r = a * b) * * r A single precision integer. @@ -24633,8 +33477,8 @@ SP_NOINLINE static void sp_384_mul_d_7(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; int i; for (i = 0; i < 7; i++) { @@ -24644,8 +33488,8 @@ SP_NOINLINE static void sp_384_mul_d_7(sp_digit* r, const sp_digit* a, } r[7] = (sp_digit)t; #else - int128_t tb = b; - int128_t t[7]; + sp_int128 tb = b; + sp_int128 t[7]; t[ 0] = tb * a[ 0]; t[ 1] = tb * a[ 1]; @@ -24665,75 +33509,56 @@ SP_NOINLINE static void sp_384_mul_d_7(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } -#ifdef WOLFSSL_SP_DIV_64 -static WC_INLINE sp_digit sp_384_div_word_7(sp_digit d1, sp_digit d0, - sp_digit dv) +SP_NOINLINE static void sp_384_lshift_14(sp_digit* r, const sp_digit* a, + byte n) { - sp_digit d; - sp_digit r; - sp_digit t; +#ifdef WOLFSSL_SP_SMALL + int i; - /* All 55 bits from d1 and top 8 bits from d0. */ - d = (d1 << 8) + (d0 >> 47); - r = d / dv; - d -= r * dv; - /* Up to 9 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 39) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 17 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 31) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 25 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 23) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 33 bits in r */ - /* Next 8 bits from d0. 
*/ - r <<= 8; - d <<= 8; - d += (d0 >> 15) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 41 bits in r */ - /* Next 8 bits from d0. */ - r <<= 8; - d <<= 8; - d += (d0 >> 7) & ((1 << 8) - 1); - t = d / dv; - d -= t * dv; - r += t; - /* Up to 49 bits in r */ - /* Remaining 7 bits from d0. */ - r <<= 7; - d <<= 7; - d += d0 & ((1 << 7) - 1); - t = d / dv; - r += t; + r[14] = a[13] >> (55 - n); + for (i=13; i>0; i--) { + r[i] = ((a[i] << n) | (a[i-1] >> (55 - n))) & 0x7fffffffffffffL; + } +#else + sp_int_digit s; + sp_int_digit t; - /* All 55 bits from d1 and top 8 bits from d0. */ - return r; + s = (sp_int_digit)a[13]; + r[14] = s >> (55U - n); + s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]); + r[13] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]); + r[12] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]); + r[11] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]); + r[10] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]); + r[9] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]); + r[8] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]); + r[7] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]); + r[6] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]); + r[5] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]); + r[4] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]); + r[3] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]); + r[2] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; + s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]); + r[1] = ((s << n) | (t >> (55U - n))) & 0x7fffffffffffffUL; +#endif /* WOLFSSL_SP_SMALL */ + r[0] = (a[0] << n) & 0x7fffffffffffffL; } -#endif /* WOLFSSL_SP_DIV_64 */ /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Simplified based on top word of divisor being (1 << 55) - 1 * * a Number to be divided. * d Number to divide with. @@ -24741,75 +33566,54 @@ static WC_INLINE sp_digit sp_384_div_word_7(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. 
*/ -static int sp_384_div_7(const sp_digit* a, const sp_digit* d, +static int sp_384_div_7(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; -#ifndef WOLFSSL_SP_DIV_64 - int128_t d1; -#endif - sp_digit dv; sp_digit r1; + sp_digit mask; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 7 + 1]; + sp_digit t1[4 * 7 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 7 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 7 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif + (void)m; + if (err == MP_OKAY) { - t2 = t1 + 2 * 7; + t2 = t1 + 14 + 1; + sd = t2 + 7 + 1; - dv = d[6]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 7U); + sp_384_mul_d_7(sd, d, (sp_digit)1 << 1); + sp_384_lshift_14(t1, a, 1); + t1[7 + 7] += t1[7 + 7 - 1] >> 55; + t1[7 + 7 - 1] &= 0x7fffffffffffffL; for (i=6; i>=0; i--) { - t1[7 + i] += t1[7 + i - 1] >> 55; - t1[7 + i - 1] &= 0x7fffffffffffffL; -#ifndef WOLFSSL_SP_DIV_64 - d1 = t1[7 + i]; - d1 <<= 55; - d1 += t1[7 + i - 1]; - r1 = (sp_digit)(d1 / dv); -#else - r1 = sp_384_div_word_7(t1[7 + i], t1[7 + i - 1], dv); -#endif - - sp_384_mul_d_7(t2, d, r1); + r1 = t1[7 + i]; + sp_384_mul_d_7(t2, sd, r1); (void)sp_384_sub_7(&t1[i], &t1[i], t2); - sp_384_norm_7(&t1[i]); t1[7 + i] -= t2[7]; - t1[7 + i] += t1[7 + i - 1] >> 55; - t1[7 + i - 1] &= 0x7fffffffffffffL; - r1 = (((-t1[7 + i]) << 55) - t1[7 + i - 1]) / dv; - r1++; - sp_384_mul_d_7(t2, d, r1); - (void)sp_384_add_7(&t1[i], &t1[i], t2); - t1[7 + i] += t1[7 + i - 1] >> 55; - t1[7 + i - 1] &= 0x7fffffffffffffL; - } - t1[7 - 1] += t1[7 - 2] >> 55; - t1[7 - 2] &= 0x7fffffffffffffL; - r1 = t1[7 - 1] / dv; + sp_384_norm_7(&t1[i + 1]); - sp_384_mul_d_7(t2, d, r1); - (void)sp_384_sub_7(t1, t1, t2); - XMEMCPY(r, t1, sizeof(*r) * 14U); - for (i=0; i<6; i++) { - r[i+1] += r[i] >> 55; - r[i] &= 0x7fffffffffffffL; + mask = (sp_digit)0 - ((t1[7 + i] > 0) ? + (sp_digit)1 : (sp_digit)0); + sp_384_cond_sub_7(t1 + i, t1 + i, sd, mask); + sp_384_norm_7(&t1[i + 1]); } - sp_384_cond_add_7(r, r, d, 0 - ((r[6] < 0) ? 
- (sp_digit)1 : (sp_digit)0)); + sp_384_norm_7(t1); + sp_384_rshift_7(r, t1, 1); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -25023,7 +33827,7 @@ static int sp_384_calc_s_7(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -25135,7 +33939,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 7U); sp_384_norm_7(ctx->r); @@ -25184,7 +33988,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_384_norm_7(ctx->x); carry = sp_384_add_7(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_7(ctx->s, ctx->s, @@ -25254,7 +34058,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; @@ -25735,7 +34539,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_7(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -25790,7 +34594,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -26482,87 +35286,87 @@ typedef struct sp_point_1024 { SP_NOINLINE static void sp_1024_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b) { - int128_t t0 = ((int128_t)a[ 0]) * b[ 0]; - int128_t t1 = ((int128_t)a[ 0]) * b[ 1] - + ((int128_t)a[ 1]) * b[ 0]; - int128_t t2 = ((int128_t)a[ 0]) * b[ 2] - + ((int128_t)a[ 1]) * b[ 1] - + ((int128_t)a[ 2]) * b[ 0]; - int128_t t3 = ((int128_t)a[ 0]) * b[ 3] - + ((int128_t)a[ 1]) * b[ 2] - + ((int128_t)a[ 2]) * b[ 1] - + ((int128_t)a[ 3]) * b[ 0]; - int128_t t4 = ((int128_t)a[ 0]) * b[ 4] - + ((int128_t)a[ 1]) * b[ 3] - + ((int128_t)a[ 2]) * b[ 2] - + ((int128_t)a[ 3]) * b[ 1] - + ((int128_t)a[ 4]) * b[ 0]; - int128_t t5 = ((int128_t)a[ 0]) * b[ 5] - + ((int128_t)a[ 1]) * b[ 4] - + ((int128_t)a[ 2]) * b[ 3] - + ((int128_t)a[ 3]) * b[ 2] - + ((int128_t)a[ 4]) * b[ 1] - + ((int128_t)a[ 5]) * b[ 0]; - int128_t t6 = ((int128_t)a[ 0]) * b[ 6] - + ((int128_t)a[ 1]) * b[ 5] - + ((int128_t)a[ 2]) * b[ 4] - + ((int128_t)a[ 3]) * b[ 3] - + ((int128_t)a[ 4]) * b[ 2] - + ((int128_t)a[ 5]) * b[ 1] - + ((int128_t)a[ 6]) * b[ 0]; - int128_t t7 = ((int128_t)a[ 0]) * b[ 7] - + ((int128_t)a[ 1]) * b[ 6] - + ((int128_t)a[ 2]) * b[ 5] - + ((int128_t)a[ 3]) * b[ 4] - + ((int128_t)a[ 4]) * b[ 3] - + ((int128_t)a[ 5]) * b[ 2] - + ((int128_t)a[ 6]) * b[ 1] - + ((int128_t)a[ 7]) * b[ 0]; - int128_t t8 = ((int128_t)a[ 0]) * b[ 8] - + ((int128_t)a[ 1]) * b[ 7] - + ((int128_t)a[ 2]) * b[ 6] - + ((int128_t)a[ 3]) * b[ 5] - + ((int128_t)a[ 4]) * b[ 4] - + ((int128_t)a[ 5]) * b[ 3] - + ((int128_t)a[ 6]) * b[ 2] - + ((int128_t)a[ 7]) * b[ 1] - + ((int128_t)a[ 8]) * b[ 0]; - int128_t t9 = ((int128_t)a[ 1]) * b[ 8] - + ((int128_t)a[ 2]) * b[ 7] - + ((int128_t)a[ 3]) * b[ 6] - + ((int128_t)a[ 4]) * b[ 5] - + ((int128_t)a[ 5]) * b[ 4] - + ((int128_t)a[ 6]) * b[ 3] - + ((int128_t)a[ 7]) * b[ 2] - + ((int128_t)a[ 8]) * b[ 1]; - int128_t t10 = 
((int128_t)a[ 2]) * b[ 8] - + ((int128_t)a[ 3]) * b[ 7] - + ((int128_t)a[ 4]) * b[ 6] - + ((int128_t)a[ 5]) * b[ 5] - + ((int128_t)a[ 6]) * b[ 4] - + ((int128_t)a[ 7]) * b[ 3] - + ((int128_t)a[ 8]) * b[ 2]; - int128_t t11 = ((int128_t)a[ 3]) * b[ 8] - + ((int128_t)a[ 4]) * b[ 7] - + ((int128_t)a[ 5]) * b[ 6] - + ((int128_t)a[ 6]) * b[ 5] - + ((int128_t)a[ 7]) * b[ 4] - + ((int128_t)a[ 8]) * b[ 3]; - int128_t t12 = ((int128_t)a[ 4]) * b[ 8] - + ((int128_t)a[ 5]) * b[ 7] - + ((int128_t)a[ 6]) * b[ 6] - + ((int128_t)a[ 7]) * b[ 5] - + ((int128_t)a[ 8]) * b[ 4]; - int128_t t13 = ((int128_t)a[ 5]) * b[ 8] - + ((int128_t)a[ 6]) * b[ 7] - + ((int128_t)a[ 7]) * b[ 6] - + ((int128_t)a[ 8]) * b[ 5]; - int128_t t14 = ((int128_t)a[ 6]) * b[ 8] - + ((int128_t)a[ 7]) * b[ 7] - + ((int128_t)a[ 8]) * b[ 6]; - int128_t t15 = ((int128_t)a[ 7]) * b[ 8] - + ((int128_t)a[ 8]) * b[ 7]; - int128_t t16 = ((int128_t)a[ 8]) * b[ 8]; + sp_int128 t0 = ((sp_int128)a[ 0]) * b[ 0]; + sp_int128 t1 = ((sp_int128)a[ 0]) * b[ 1] + + ((sp_int128)a[ 1]) * b[ 0]; + sp_int128 t2 = ((sp_int128)a[ 0]) * b[ 2] + + ((sp_int128)a[ 1]) * b[ 1] + + ((sp_int128)a[ 2]) * b[ 0]; + sp_int128 t3 = ((sp_int128)a[ 0]) * b[ 3] + + ((sp_int128)a[ 1]) * b[ 2] + + ((sp_int128)a[ 2]) * b[ 1] + + ((sp_int128)a[ 3]) * b[ 0]; + sp_int128 t4 = ((sp_int128)a[ 0]) * b[ 4] + + ((sp_int128)a[ 1]) * b[ 3] + + ((sp_int128)a[ 2]) * b[ 2] + + ((sp_int128)a[ 3]) * b[ 1] + + ((sp_int128)a[ 4]) * b[ 0]; + sp_int128 t5 = ((sp_int128)a[ 0]) * b[ 5] + + ((sp_int128)a[ 1]) * b[ 4] + + ((sp_int128)a[ 2]) * b[ 3] + + ((sp_int128)a[ 3]) * b[ 2] + + ((sp_int128)a[ 4]) * b[ 1] + + ((sp_int128)a[ 5]) * b[ 0]; + sp_int128 t6 = ((sp_int128)a[ 0]) * b[ 6] + + ((sp_int128)a[ 1]) * b[ 5] + + ((sp_int128)a[ 2]) * b[ 4] + + ((sp_int128)a[ 3]) * b[ 3] + + ((sp_int128)a[ 4]) * b[ 2] + + ((sp_int128)a[ 5]) * b[ 1] + + ((sp_int128)a[ 6]) * b[ 0]; + sp_int128 t7 = ((sp_int128)a[ 0]) * b[ 7] + + ((sp_int128)a[ 1]) * b[ 6] + + ((sp_int128)a[ 2]) * b[ 5] + + ((sp_int128)a[ 3]) * b[ 4] + + ((sp_int128)a[ 4]) * b[ 3] + + ((sp_int128)a[ 5]) * b[ 2] + + ((sp_int128)a[ 6]) * b[ 1] + + ((sp_int128)a[ 7]) * b[ 0]; + sp_int128 t8 = ((sp_int128)a[ 0]) * b[ 8] + + ((sp_int128)a[ 1]) * b[ 7] + + ((sp_int128)a[ 2]) * b[ 6] + + ((sp_int128)a[ 3]) * b[ 5] + + ((sp_int128)a[ 4]) * b[ 4] + + ((sp_int128)a[ 5]) * b[ 3] + + ((sp_int128)a[ 6]) * b[ 2] + + ((sp_int128)a[ 7]) * b[ 1] + + ((sp_int128)a[ 8]) * b[ 0]; + sp_int128 t9 = ((sp_int128)a[ 1]) * b[ 8] + + ((sp_int128)a[ 2]) * b[ 7] + + ((sp_int128)a[ 3]) * b[ 6] + + ((sp_int128)a[ 4]) * b[ 5] + + ((sp_int128)a[ 5]) * b[ 4] + + ((sp_int128)a[ 6]) * b[ 3] + + ((sp_int128)a[ 7]) * b[ 2] + + ((sp_int128)a[ 8]) * b[ 1]; + sp_int128 t10 = ((sp_int128)a[ 2]) * b[ 8] + + ((sp_int128)a[ 3]) * b[ 7] + + ((sp_int128)a[ 4]) * b[ 6] + + ((sp_int128)a[ 5]) * b[ 5] + + ((sp_int128)a[ 6]) * b[ 4] + + ((sp_int128)a[ 7]) * b[ 3] + + ((sp_int128)a[ 8]) * b[ 2]; + sp_int128 t11 = ((sp_int128)a[ 3]) * b[ 8] + + ((sp_int128)a[ 4]) * b[ 7] + + ((sp_int128)a[ 5]) * b[ 6] + + ((sp_int128)a[ 6]) * b[ 5] + + ((sp_int128)a[ 7]) * b[ 4] + + ((sp_int128)a[ 8]) * b[ 3]; + sp_int128 t12 = ((sp_int128)a[ 4]) * b[ 8] + + ((sp_int128)a[ 5]) * b[ 7] + + ((sp_int128)a[ 6]) * b[ 6] + + ((sp_int128)a[ 7]) * b[ 5] + + ((sp_int128)a[ 8]) * b[ 4]; + sp_int128 t13 = ((sp_int128)a[ 5]) * b[ 8] + + ((sp_int128)a[ 6]) * b[ 7] + + ((sp_int128)a[ 7]) * b[ 6] + + ((sp_int128)a[ 8]) * b[ 5]; + sp_int128 t14 = ((sp_int128)a[ 6]) * b[ 8] + + ((sp_int128)a[ 7]) * b[ 7] + + ((sp_int128)a[ 8]) * b[ 6]; 
+ sp_int128 t15 = ((sp_int128)a[ 7]) * b[ 8] + + ((sp_int128)a[ 8]) * b[ 7]; + sp_int128 t16 = ((sp_int128)a[ 8]) * b[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -26591,51 +35395,51 @@ SP_NOINLINE static void sp_1024_mul_9(sp_digit* r, const sp_digit* a, */ SP_NOINLINE static void sp_1024_sqr_9(sp_digit* r, const sp_digit* a) { - int128_t t0 = ((int128_t)a[ 0]) * a[ 0]; - int128_t t1 = (((int128_t)a[ 0]) * a[ 1]) * 2; - int128_t t2 = (((int128_t)a[ 0]) * a[ 2]) * 2 - + ((int128_t)a[ 1]) * a[ 1]; - int128_t t3 = (((int128_t)a[ 0]) * a[ 3] - + ((int128_t)a[ 1]) * a[ 2]) * 2; - int128_t t4 = (((int128_t)a[ 0]) * a[ 4] - + ((int128_t)a[ 1]) * a[ 3]) * 2 - + ((int128_t)a[ 2]) * a[ 2]; - int128_t t5 = (((int128_t)a[ 0]) * a[ 5] - + ((int128_t)a[ 1]) * a[ 4] - + ((int128_t)a[ 2]) * a[ 3]) * 2; - int128_t t6 = (((int128_t)a[ 0]) * a[ 6] - + ((int128_t)a[ 1]) * a[ 5] - + ((int128_t)a[ 2]) * a[ 4]) * 2 - + ((int128_t)a[ 3]) * a[ 3]; - int128_t t7 = (((int128_t)a[ 0]) * a[ 7] - + ((int128_t)a[ 1]) * a[ 6] - + ((int128_t)a[ 2]) * a[ 5] - + ((int128_t)a[ 3]) * a[ 4]) * 2; - int128_t t8 = (((int128_t)a[ 0]) * a[ 8] - + ((int128_t)a[ 1]) * a[ 7] - + ((int128_t)a[ 2]) * a[ 6] - + ((int128_t)a[ 3]) * a[ 5]) * 2 - + ((int128_t)a[ 4]) * a[ 4]; - int128_t t9 = (((int128_t)a[ 1]) * a[ 8] - + ((int128_t)a[ 2]) * a[ 7] - + ((int128_t)a[ 3]) * a[ 6] - + ((int128_t)a[ 4]) * a[ 5]) * 2; - int128_t t10 = (((int128_t)a[ 2]) * a[ 8] - + ((int128_t)a[ 3]) * a[ 7] - + ((int128_t)a[ 4]) * a[ 6]) * 2 - + ((int128_t)a[ 5]) * a[ 5]; - int128_t t11 = (((int128_t)a[ 3]) * a[ 8] - + ((int128_t)a[ 4]) * a[ 7] - + ((int128_t)a[ 5]) * a[ 6]) * 2; - int128_t t12 = (((int128_t)a[ 4]) * a[ 8] - + ((int128_t)a[ 5]) * a[ 7]) * 2 - + ((int128_t)a[ 6]) * a[ 6]; - int128_t t13 = (((int128_t)a[ 5]) * a[ 8] - + ((int128_t)a[ 6]) * a[ 7]) * 2; - int128_t t14 = (((int128_t)a[ 6]) * a[ 8]) * 2 - + ((int128_t)a[ 7]) * a[ 7]; - int128_t t15 = (((int128_t)a[ 7]) * a[ 8]) * 2; - int128_t t16 = ((int128_t)a[ 8]) * a[ 8]; + sp_int128 t0 = ((sp_int128)a[ 0]) * a[ 0]; + sp_int128 t1 = (((sp_int128)a[ 0]) * a[ 1]) * 2; + sp_int128 t2 = (((sp_int128)a[ 0]) * a[ 2]) * 2 + + ((sp_int128)a[ 1]) * a[ 1]; + sp_int128 t3 = (((sp_int128)a[ 0]) * a[ 3] + + ((sp_int128)a[ 1]) * a[ 2]) * 2; + sp_int128 t4 = (((sp_int128)a[ 0]) * a[ 4] + + ((sp_int128)a[ 1]) * a[ 3]) * 2 + + ((sp_int128)a[ 2]) * a[ 2]; + sp_int128 t5 = (((sp_int128)a[ 0]) * a[ 5] + + ((sp_int128)a[ 1]) * a[ 4] + + ((sp_int128)a[ 2]) * a[ 3]) * 2; + sp_int128 t6 = (((sp_int128)a[ 0]) * a[ 6] + + ((sp_int128)a[ 1]) * a[ 5] + + ((sp_int128)a[ 2]) * a[ 4]) * 2 + + ((sp_int128)a[ 3]) * a[ 3]; + sp_int128 t7 = (((sp_int128)a[ 0]) * a[ 7] + + ((sp_int128)a[ 1]) * a[ 6] + + ((sp_int128)a[ 2]) * a[ 5] + + ((sp_int128)a[ 3]) * a[ 4]) * 2; + sp_int128 t8 = (((sp_int128)a[ 0]) * a[ 8] + + ((sp_int128)a[ 1]) * a[ 7] + + ((sp_int128)a[ 2]) * a[ 6] + + ((sp_int128)a[ 3]) * a[ 5]) * 2 + + ((sp_int128)a[ 4]) * a[ 4]; + sp_int128 t9 = (((sp_int128)a[ 1]) * a[ 8] + + ((sp_int128)a[ 2]) * a[ 7] + + ((sp_int128)a[ 3]) * a[ 6] + + ((sp_int128)a[ 4]) * a[ 5]) * 2; + sp_int128 t10 = (((sp_int128)a[ 2]) * a[ 8] + + ((sp_int128)a[ 3]) * a[ 7] + + ((sp_int128)a[ 4]) * a[ 6]) * 2 + + ((sp_int128)a[ 5]) * a[ 5]; + sp_int128 t11 = (((sp_int128)a[ 3]) * a[ 8] + + ((sp_int128)a[ 4]) * a[ 7] + + ((sp_int128)a[ 5]) * a[ 6]) * 2; + sp_int128 t12 = (((sp_int128)a[ 4]) * a[ 8] + + ((sp_int128)a[ 5]) * a[ 7]) * 2 + + ((sp_int128)a[ 6]) * a[ 6]; + sp_int128 t13 = 
(((sp_int128)a[ 5]) * a[ 8] + + ((sp_int128)a[ 6]) * a[ 7]) * 2; + sp_int128 t14 = (((sp_int128)a[ 6]) * a[ 8]) * 2 + + ((sp_int128)a[ 7]) * a[ 7]; + sp_int128 t15 = (((sp_int128)a[ 7]) * a[ 8]) * 2; + sp_int128 t16 = ((sp_int128)a[ 8]) * a[ 8]; t1 += t0 >> 57; r[ 0] = t0 & 0x1ffffffffffffffL; t2 += t1 >> 57; r[ 1] = t1 & 0x1ffffffffffffffL; @@ -26788,30 +35592,33 @@ SP_NOINLINE static void sp_1024_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b) { int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 lo; - c = ((int128_t)a[17]) * b[17]; + c = ((sp_uint128)a[17]) * b[17]; r[35] = (sp_digit)(c >> 57); - c = (c & 0x1ffffffffffffffL) << 57; + c &= 0x1ffffffffffffffL; for (k = 33; k >= 0; k--) { - for (i = 17; i >= 0; i--) { - j = k - i; - if (j >= 18) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * b[j]; + if (k >= 18) { + i = k - 17; + imax = 17; } - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); - c = (c & 0x1ffffffffffffffL) << 57; + else { + i = 0; + imax = k; + } + lo = 0; + for (; i <= imax; i++) { + lo += ((sp_uint128)a[i]) * b[k - i]; + } + c += lo >> 57; + r[k + 2] += (sp_digit)(c >> 57); + r[k + 1] = (sp_digit)(c & 0x1ffffffffffffffL); + c = lo & 0x1ffffffffffffffL; } - r[0] = (sp_digit)(c >> 57); + r[0] = (sp_digit)c; } /* Square a and put result in r. (r = a * a) @@ -26822,31 +35629,34 @@ SP_NOINLINE static void sp_1024_mul_18(sp_digit* r, const sp_digit* a, SP_NOINLINE static void sp_1024_sqr_18(sp_digit* r, const sp_digit* a) { int i; - int j; + int imax; int k; - int128_t c; + sp_uint128 c; + sp_uint128 t; - c = ((int128_t)a[17]) * a[17]; + c = ((sp_uint128)a[17]) * a[17]; r[35] = (sp_digit)(c >> 57); c = (c & 0x1ffffffffffffffL) << 57; for (k = 33; k >= 0; k--) { - for (i = 17; i >= 0; i--) { - j = k - i; - if (j >= 18 || i <= j) { - break; - } - if (j < 0) { - continue; - } - - c += ((int128_t)a[i]) * a[j] * 2; + i = (k + 1) / 2; + if ((k & 1) == 0) { + c += ((sp_uint128)a[i]) * a[i]; + i++; } - if (i == j) { - c += ((int128_t)a[i]) * a[i]; + if (k < 17) { + imax = k; } + else { + imax = 17; + } + t = 0; + for (; i <= imax; i++) { + t += ((sp_uint128)a[i]) * a[k - i]; + } + c += t * 2; - r[k + 2] += (sp_digit)(c >> 114); - r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); + r[k + 2] += (sp_digit) (c >> 114); + r[k + 1] = (sp_digit)((c >> 57) & 0x1ffffffffffffffL); c = (c & 0x1ffffffffffffffL) << 57; } r[0] = (sp_digit)(c >> 57); @@ -26926,7 +35736,7 @@ static const sp_point_1024 p1024_base = { 0 }; -/* Normalize the values in each word to 57. +/* Normalize the values in each word to 57 bits. * * a Array of sp_digit to normalize. */ @@ -26950,8 +35760,8 @@ static void sp_1024_norm_18(sp_digit* a) a[i+7] += a[i+6] >> 57; a[i+6] &= 0x1ffffffffffffffL; a[i+8] += a[i+7] >> 57; a[i+7] &= 0x1ffffffffffffffL; } - a[16+1] += a[16] >> 57; a[16] &= 0x1ffffffffffffffL; -#endif + a[17] += a[16] >> 57; a[16] &= 0x1ffffffffffffffL; +#endif /* WOLFSSL_SP_SMALL */ } /* Multiply a by scalar b into r. 
(r = a * b) @@ -26964,8 +35774,8 @@ SP_NOINLINE static void sp_1024_mul_d_18(sp_digit* r, const sp_digit* a, sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; int i; for (i = 0; i < 18; i++) { @@ -26975,10 +35785,10 @@ SP_NOINLINE static void sp_1024_mul_d_18(sp_digit* r, const sp_digit* a, } r[18] = (sp_digit)t; #else - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t = 0; sp_digit t2; - int128_t p[4]; + sp_int128 p[4]; int i; for (i = 0; i < 16; i += 4) { @@ -27013,6 +35823,59 @@ SP_NOINLINE static void sp_1024_mul_d_18(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ } +/* Multiply a by scalar b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A scalar. + */ +SP_NOINLINE static void sp_1024_mul_d_36(sp_digit* r, const sp_digit* a, + sp_digit b) +{ +#ifdef WOLFSSL_SP_SMALL + sp_int128 tb = b; + sp_int128 t = 0; + int i; + + for (i = 0; i < 36; i++) { + t += tb * a[i]; + r[i] = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + } + r[36] = (sp_digit)t; +#else + sp_int128 tb = b; + sp_int128 t = 0; + sp_digit t2; + sp_int128 p[4]; + int i; + + for (i = 0; i < 36; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = (sp_digit)t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = (sp_digit)t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = (sp_digit)t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = (sp_digit)t2; + } + r[36] = (sp_digit)(t & 0x1ffffffffffffffL); +#endif /* WOLFSSL_SP_SMALL */ +} + /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -27086,7 +35949,33 @@ SP_NOINLINE static int sp_1024_add_18(sp_digit* r, const sp_digit* a, return 0; } -#endif +#endif /* WOLFSSL_SP_SMALL */ + +SP_NOINLINE static void sp_1024_rshift_18(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<17; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (57 - n))) & 0x1ffffffffffffffL; + } +#else + for (i=0; i<16; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (57 - n)) & 0x1ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (57 - n)) & 0x1ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (57 - n)) & 0x1ffffffffffffffL); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (57 - n)) & 0x1ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (57 - n)) & 0x1ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (57 - n)) & 0x1ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (57 - n)) & 0x1ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (57 - n)) & 0x1ffffffffffffffL); + } + r[16] = (a[16] >> n) | ((a[17] << (57 - n)) & 0x1ffffffffffffffL); +#endif /* WOLFSSL_SP_SMALL */ + r[17] = a[17] >> n; +} + #ifdef WOLFSSL_SP_DIV_64 static WC_INLINE sp_digit sp_1024_div_word_18(sp_digit d1, sp_digit d0, sp_digit dv) @@ -27179,7 +36068,7 @@ static WC_INLINE sp_digit sp_1024_div_word_18(sp_digit d1, sp_digit d0, /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * - * Large number of bits in last word. + * Full implementation. * * a Number to be divided. * d Number to divide with. 
@@ -27187,40 +36076,45 @@ static WC_INLINE sp_digit sp_1024_div_word_18(sp_digit d1, sp_digit d0, * r Remainder from the division. * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise. */ -static int sp_1024_div_18(const sp_digit* a, const sp_digit* d, +static int sp_1024_div_18(const sp_digit* a, const sp_digit* d, const sp_digit* m, sp_digit* r) { int i; #ifndef WOLFSSL_SP_DIV_64 - int128_t d1; + sp_int128 d1; #endif sp_digit dv; sp_digit r1; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* t1 = NULL; #else - sp_digit t1[3 * 18 + 1]; + sp_digit t1[4 * 18 + 3]; #endif sp_digit* t2 = NULL; + sp_digit* sd = NULL; int err = MP_OKAY; (void)m; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (3 * 18 + 1), NULL, + t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 18 + 3), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (t1 == NULL) err = MEMORY_E; #endif - if (err == MP_OKAY) { - t2 = t1 + 2 * 18; + (void)m; - dv = d[17]; - XMEMCPY(t1, a, sizeof(*t1) * 2U * 18U); - for (i=17; i>=0; i--) { - t1[18 + i] += t1[18 + i - 1] >> 57; - t1[18 + i - 1] &= 0x1ffffffffffffffL; + if (err == MP_OKAY) { + t2 = t1 + 36 + 1; + sd = t2 + 18 + 1; + + sp_1024_mul_d_18(sd, d, (sp_digit)1 << 2); + sp_1024_mul_d_36(t1, a, (sp_digit)1 << 2); + dv = sd[17]; + t1[18 + 18] += t1[18 + 18 - 1] >> 57; + t1[18 + 18 - 1] &= 0x1ffffffffffffffL; + for (i=18; i>=0; i--) { #ifndef WOLFSSL_SP_DIV_64 d1 = t1[18 + i]; d1 <<= 57; @@ -27230,15 +36124,22 @@ static int sp_1024_div_18(const sp_digit* a, const sp_digit* d, r1 = sp_1024_div_word_18(t1[18 + i], t1[18 + i - 1], dv); #endif - sp_1024_mul_d_18(t2, d, r1); + sp_1024_mul_d_18(t2, sd, r1); (void)sp_1024_sub_18(&t1[i], &t1[i], t2); sp_1024_norm_18(&t1[i]); t1[18 + i] -= t2[18]; t1[18 + i] += t1[18 + i - 1] >> 57; t1[18 + i - 1] &= 0x1ffffffffffffffL; - r1 = (((-t1[18 + i]) << 57) - t1[18 + i - 1]) / dv; - r1++; - sp_1024_mul_d_18(t2, d, r1); +#ifndef WOLFSSL_SP_DIV_64 + d1 = -t1[18 + i]; + d1 <<= 57; + d1 -= t1[18 + i - 1]; + r1 = (sp_digit)(d1 / dv); +#else + r1 = sp_1024_div_word_18(-t1[18 + i], -t1[18 + i - 1], dv); +#endif + r1 -= t1[18 + i]; + sp_1024_mul_d_18(t2, sd, r1); (void)sp_1024_add_18(&t1[i], &t1[i], t2); t1[18 + i] += t1[18 + i - 1] >> 57; t1[18 + i - 1] &= 0x1ffffffffffffffL; @@ -27247,15 +36148,18 @@ static int sp_1024_div_18(const sp_digit* a, const sp_digit* d, t1[18 - 2] &= 0x1ffffffffffffffL; r1 = t1[18 - 1] / dv; - sp_1024_mul_d_18(t2, d, r1); - (void)sp_1024_sub_18(t1, t1, t2); + sp_1024_mul_d_18(t2, sd, r1); + sp_1024_sub_18(t1, t1, t2); XMEMCPY(r, t1, sizeof(*r) * 36U); for (i=0; i<17; i++) { r[i+1] += r[i] >> 57; r[i] &= 0x1ffffffffffffffL; } - sp_1024_cond_add_18(r, r, d, 0 - ((r[17] < 0) ? + sp_1024_cond_add_18(r, r, sd, 0 - ((r[17] < 0) ? 
(sp_digit)1 : (sp_digit)0)); + + sp_1024_norm_18(r); + sp_1024_rshift_18(r, r, 2); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -27626,19 +36530,34 @@ SP_NOINLINE static void sp_1024_mul_add_18(sp_digit* r, const sp_digit* a, const sp_digit b) { #ifdef WOLFSSL_SP_SMALL - int128_t tb = b; - int128_t t = 0; + sp_int128 tb = b; + sp_int128 t[4]; int i; - for (i = 0; i < 18; i++) { - t += (tb * a[i]) + r[i]; - r[i] = t & 0x1ffffffffffffffL; - t >>= 57; + t[0] = 0; + for (i = 0; i < 16; i += 4) { + t[0] += (tb * a[i+0]) + r[i+0]; + t[1] = (tb * a[i+1]) + r[i+1]; + t[2] = (tb * a[i+2]) + r[i+2]; + t[3] = (tb * a[i+3]) + r[i+3]; + r[i+0] = t[0] & 0x1ffffffffffffffL; + t[1] += t[0] >> 57; + r[i+1] = t[1] & 0x1ffffffffffffffL; + t[2] += t[1] >> 57; + r[i+2] = t[2] & 0x1ffffffffffffffL; + t[3] += t[2] >> 57; + r[i+3] = t[3] & 0x1ffffffffffffffL; + t[0] = t[3] >> 57; } - r[18] += (sp_digit)t; + t[0] += (tb * a[16]) + r[16]; + t[1] = (tb * a[17]) + r[17]; + r[16] = t[0] & 0x1ffffffffffffffL; + t[1] += t[0] >> 57; + r[17] = t[1] & 0x1ffffffffffffffL; + r[18] += (sp_digit)(t[1] >> 57); #else - int128_t tb = b; - int128_t t[8]; + sp_int128 tb = b; + sp_int128 t[8]; int i; t[0] = tb * a[0]; r[0] += (sp_digit)(t[0] & 0x1ffffffffffffffL); @@ -27675,34 +36594,34 @@ static void sp_1024_mont_shift_18(sp_digit* r, const sp_digit* a) { #ifdef WOLFSSL_SP_SMALL int i; - word64 n; + sp_uint64 n; n = a[17] >> 55; for (i = 0; i < 17; i++) { - n += (word64)a[18 + i] << 2; + n += (sp_uint64)a[18 + i] << 2; r[i] = n & 0x1ffffffffffffffL; n >>= 57; } - n += (word64)a[35] << 2; + n += (sp_uint64)a[35] << 2; r[17] = n; #else - word64 n; + sp_uint64 n; int i; - n = (word64)a[17]; + n = (sp_uint64)a[17]; n = n >> 55U; for (i = 0; i < 16; i += 8) { - n += (word64)a[i+18] << 2U; r[i+0] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+19] << 2U; r[i+1] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+20] << 2U; r[i+2] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+21] << 2U; r[i+3] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+22] << 2U; r[i+4] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+23] << 2U; r[i+5] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+24] << 2U; r[i+6] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[i+25] << 2U; r[i+7] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+18] << 2U; r[i+0] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+19] << 2U; r[i+1] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+20] << 2U; r[i+2] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+21] << 2U; r[i+3] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+22] << 2U; r[i+4] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+23] << 2U; r[i+5] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+24] << 2U; r[i+6] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[i+25] << 2U; r[i+7] = n & 0x1ffffffffffffffUL; n >>= 57U; } - n += (word64)a[34] << 2U; r[16] = n & 0x1ffffffffffffffUL; n >>= 57U; - n += (word64)a[35] << 2U; r[17] = n; + n += (sp_uint64)a[34] << 2U; r[16] = n & 0x1ffffffffffffffUL; n >>= 57U; + n += (sp_uint64)a[35] << 2U; r[17] = n; #endif /* WOLFSSL_SP_SMALL */ XMEMSET(&r[18], 0, sizeof(*r) * 18U); } @@ -27859,7 +36778,7 @@ static void sp_1024_map_18(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*18; - int64_t n; + sp_int64 n; sp_1024_mont_inv_18(t1, p->z, t + 2*18); @@ -36668,7 +45587,7 @@ static int 
sp_1024_ecc_is_point_18(const sp_point_1024* point, sp_digit t1[18 * 4]; #endif sp_digit* t2 = NULL; - int64_t n; + sp_int64 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index d3607ab33..334edb9f2 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -56,6 +56,17 @@ #endif #ifdef WOLFSSL_SP_ARM_CORTEX_M_ASM +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii \ + fprintf(stderr, name "=0x"); \ + for (ii = words - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define SP_PRINT_VAL(var, name) \ + fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 /* Read big endian unsigned byte array into r. @@ -184,7 +195,7 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_64(sp_digit* r, byte* a) { int i; int j; @@ -218,6 +229,18 @@ static void sp_2048_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_64(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -3058,7 +3081,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -3729,7 +3752,7 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -3887,6 +3910,137 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const s static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 128]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 128; + } + + sp_2048_mont_setup(m, &mp); + sp_2048_mont_norm_64(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); + if (reduceA != 0) { + err = sp_2048_mod_64(t[1] + 64, a, m); + if (err == MP_OKAY) { + err = sp_2048_mod_64(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); + err = sp_2048_mod_64(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); + sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); + sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); + sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); + sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); + sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + sp_2048_mont_sqr_64(r, r, m, mp); + + sp_2048_mont_mul_64(r, r, t[y], m, mp); + } + + XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); + sp_2048_mont_reduce_64(r, m, mp); + + mask = 0 - (sp_2048_cmp_64(r, m) >= 0); + sp_2048_cond_sub_64(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
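
The replacement sp_2048_mod_exp_64 above shrinks the precomputed table to eight entries and walks the exponent three bits at a time (c -= bits % 3, the >> 29 / & 0x7 extraction, three sp_2048_mont_sqr_64 calls per window). A hedged sketch of that left-to-right window schedule on plain 64-bit integers (powmod_win3 and mulmod are illustrative names; the Montgomery form, scratch management and constant-time handling of the real routine are omitted, and unsigned __int128 is assumed):

    #include <stdint.h>

    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m)
    {
        return (uint64_t)(((unsigned __int128)a * b) % m);
    }

    /* g^e mod m with a 3-bit window: precompute g^0..g^7, then per
     * window do three squarings and one table multiply. */
    static uint64_t powmod_win3(uint64_t g, uint64_t e, uint64_t m)
    {
        uint64_t t[8];
        uint64_t r;
        int bits, top, i;

        t[0] = 1 % m;
        for (i = 1; i < 8; i++)
            t[i] = mulmod(t[i - 1], g, m);          /* t[i] = g^i mod m */

        if (e == 0)
            return t[0];

        bits = 64;
        while (((e >> (bits - 1)) & 1) == 0)        /* index of the top set bit, plus one */
            bits--;

        top = bits % 3;                             /* leftover bits that do not fill a window */
        if (top == 0)
            top = 3;
        r = t[(e >> (bits - top)) & ((1u << top) - 1)];

        for (i = bits - top - 3; i >= 0; i -= 3) {
            r = mulmod(r, r, m);                    /* three squarings ... */
            r = mulmod(r, r, m);
            r = mulmod(r, r, m);
            r = mulmod(r, t[(e >> i) & 0x7], m);    /* ... then multiply by g^window */
        }
        return r;
    }

For example, powmod_win3(2, 10, 1000) is 24 (2^10 = 1024). In the patched routine the table entries are Montgomery-form multi-precision values, so the scratch drops from 32 * 128 to 8 * 128 sp_digits while the per-window work becomes three squarings and one multiply instead of five and one.
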
+ */ +static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -4007,163 +4161,6 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_cond_sub_64(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 128]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 128), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 128; - } - - sp_2048_mont_setup(m, &mp); - sp_2048_mont_norm_64(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 64U); - if (reduceA != 0) { - err = sp_2048_mod_64(t[1] + 64, a, m); - if (err == MP_OKAY) { - err = sp_2048_mod_64(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64); - err = sp_2048_mod_64(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp); - sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp); - sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp); - sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp); - sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp); - sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp); - sp_2048_mont_sqr_64(t[ 8], t[ 4], m, mp); - sp_2048_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp); - sp_2048_mont_sqr_64(t[10], t[ 5], m, mp); - sp_2048_mont_mul_64(t[11], t[ 6], t[ 5], m, mp); - sp_2048_mont_sqr_64(t[12], t[ 6], m, mp); - sp_2048_mont_mul_64(t[13], t[ 7], t[ 6], m, mp); - sp_2048_mont_sqr_64(t[14], t[ 7], m, mp); - sp_2048_mont_mul_64(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_64(t[16], t[ 8], m, mp); - sp_2048_mont_mul_64(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_64(t[18], t[ 9], m, mp); - sp_2048_mont_mul_64(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_64(t[20], t[10], m, mp); - sp_2048_mont_mul_64(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_64(t[22], t[11], m, mp); - sp_2048_mont_mul_64(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_64(t[24], t[12], m, mp); - sp_2048_mont_mul_64(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_64(t[26], t[13], m, mp); - sp_2048_mont_mul_64(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_64(t[28], t[14], m, mp); - sp_2048_mont_mul_64(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_64(t[30], t[15], m, mp); - sp_2048_mont_mul_64(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 5; 
- if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - sp_2048_mont_sqr_64(r, r, m, mp); - - sp_2048_mont_mul_64(r, r, t[y], m, mp); - } - - XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U); - sp_2048_mont_reduce_64(r, m, mp); - - mask = 0 - (sp_2048_cmp_64(r, m) >= 0); - sp_2048_cond_sub_64(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -4194,7 +4191,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[64 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -4292,7 +4289,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -4423,7 +4420,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -4514,7 +4511,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32); sp_2048_add_64(r, tmpb, tmpa); - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; } @@ -5222,7 +5219,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_64(r, out); *outLen = 256; for (i=0; i<256 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -5421,7 +5418,7 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_96(sp_digit* r, byte* a) { int i; int j; @@ -5455,6 +5452,18 @@ static void sp_3072_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_96(a) + #ifndef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * @@ -7952,7 +7961,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -8624,7 +8633,7 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -8784,6 +8793,137 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const s static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 192]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 192), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 192; + } + + sp_3072_mont_setup(m, &mp); + sp_3072_mont_norm_96(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); + if (reduceA != 0) { + err = sp_3072_mod_96(t[1] + 96, a, m); + if (err == MP_OKAY) { + err = sp_3072_mod_96(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); + err = sp_3072_mod_96(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); + sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); + sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); + sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); + sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); + sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 96); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + sp_3072_mont_sqr_96(r, r, m, mp); + + sp_3072_mont_mul_96(r, r, t[y], m, mp); + } + + XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); + sp_3072_mont_reduce_96(r, m, mp); + + mask = 0 - (sp_3072_cmp_96(r, m) >= 0); + sp_3072_cond_sub_96(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
+ */ +static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -8904,163 +9044,6 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_cond_sub_96(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 192]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 192), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 192; - } - - sp_3072_mont_setup(m, &mp); - sp_3072_mont_norm_96(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 96U); - if (reduceA != 0) { - err = sp_3072_mod_96(t[1] + 96, a, m); - if (err == MP_OKAY) { - err = sp_3072_mod_96(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 96, a, sizeof(sp_digit) * 96); - err = sp_3072_mod_96(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_3072_mont_sqr_96(t[ 2], t[ 1], m, mp); - sp_3072_mont_mul_96(t[ 3], t[ 2], t[ 1], m, mp); - sp_3072_mont_sqr_96(t[ 4], t[ 2], m, mp); - sp_3072_mont_mul_96(t[ 5], t[ 3], t[ 2], m, mp); - sp_3072_mont_sqr_96(t[ 6], t[ 3], m, mp); - sp_3072_mont_mul_96(t[ 7], t[ 4], t[ 3], m, mp); - sp_3072_mont_sqr_96(t[ 8], t[ 4], m, mp); - sp_3072_mont_mul_96(t[ 9], t[ 5], t[ 4], m, mp); - sp_3072_mont_sqr_96(t[10], t[ 5], m, mp); - sp_3072_mont_mul_96(t[11], t[ 6], t[ 5], m, mp); - sp_3072_mont_sqr_96(t[12], t[ 6], m, mp); - sp_3072_mont_mul_96(t[13], t[ 7], t[ 6], m, mp); - sp_3072_mont_sqr_96(t[14], t[ 7], m, mp); - sp_3072_mont_mul_96(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_96(t[16], t[ 8], m, mp); - sp_3072_mont_mul_96(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_96(t[18], t[ 9], m, mp); - sp_3072_mont_mul_96(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_96(t[20], t[10], m, mp); - sp_3072_mont_mul_96(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_96(t[22], t[11], m, mp); - sp_3072_mont_mul_96(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_96(t[24], t[12], m, mp); - sp_3072_mont_mul_96(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_96(t[26], t[13], m, mp); - sp_3072_mont_mul_96(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_96(t[28], t[14], m, mp); - sp_3072_mont_mul_96(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_96(t[30], t[15], m, mp); - sp_3072_mont_mul_96(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - if (c == 0) { - c = 32; - } - c -= bits % 5; 
- if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 96); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - sp_3072_mont_sqr_96(r, r, m, mp); - - sp_3072_mont_mul_96(r, r, t[y], m, mp); - } - - XMEMSET(&r[96], 0, sizeof(sp_digit) * 96U); - sp_3072_mont_reduce_96(r, m, mp); - - mask = 0 - (sp_3072_cmp_96(r, m) >= 0); - sp_3072_cond_sub_96(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -9091,7 +9074,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[96 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -9189,7 +9172,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -9320,7 +9303,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -9411,7 +9394,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[48], 0, sizeof(sp_digit) * 48); sp_3072_add_96(r, tmpb, tmpa); - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; } @@ -10315,7 +10298,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_96(r, out); *outLen = 384; for (i=0; i<384 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -10514,7 +10497,7 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_128(sp_digit* r, byte* a) { int i; int j; @@ -10548,6 +10531,18 @@ static void sp_4096_to_bin(sp_digit* r, byte* a) } } +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + +/* Normalize the values in each word to 32. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_128(a) + #ifndef WOLFSSL_SP_SMALL /* Sub b from a into r. (r = a - b) * @@ -11946,7 +11941,7 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
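
The sp_RsaPrivate_2048/_3072 paths above finish by adding the two half-size pieces (tmpa, tmpb) and serializing with the renamed to_bin helpers; RSA private-key code of this shape conventionally uses CRT (Garner) recombination of the mod-p and mod-q exponentiations. A toy-integer sketch of that recombination, with tiny made-up parameters rather than anything from the library:

    #include <stdio.h>
    #include <stdint.h>

    /* Small square-and-multiply; only safe for the tiny operands below. */
    static uint64_t powmod(uint64_t b, uint64_t e, uint64_t m)
    {
        uint64_t r = 1 % m;
        b %= m;
        while (e != 0) {
            if (e & 1)
                r = (r * b) % m;
            b = (b * b) % m;
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        /* toy key: p = 11, q = 13, n = 143, e = 7, d = 43 */
        uint64_t p = 11, q = 13;
        uint64_t dP = 43 % 10;                /* d mod (p-1) = 3 */
        uint64_t dQ = 43 % 12;                /* d mod (q-1) = 7 */
        uint64_t qInv = 6;                    /* 13 * 6 == 1 (mod 11) */
        uint64_t c = 81;                      /* 42^7 mod 143, ciphertext of m = 42 */

        uint64_t m1 = powmod(c, dP, p);                         /* c^dP mod p */
        uint64_t m2 = powmod(c, dQ, q);                         /* c^dQ mod q */
        uint64_t h  = (qInv * ((m1 + p - (m2 % p)) % p)) % p;   /* qInv*(m1 - m2) mod p */
        uint64_t m  = m2 + h * q;                               /* recombined result */

        printf("m = %llu\n", (unsigned long long)m);            /* prints 42 */
        return 0;
    }
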
*/ -SP_NOINLINE static int32_t sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -12106,6 +12101,137 @@ static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, int bits, const sp_digit* m, int reduceA) { +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + sp_digit* td = NULL; +#else + sp_digit td[8 * 256]; +#endif + sp_digit* t[8]; + sp_digit* norm = NULL; + sp_digit mp = 1; + sp_digit n; + sp_digit mask; + int i; + int c; + byte y; + int err = MP_OKAY; + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 256), NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (td == NULL) + err = MEMORY_E; +#endif + + if (err == MP_OKAY) { + norm = td; + for (i=0; i<8; i++) { + t[i] = td + i * 256; + } + + sp_4096_mont_setup(m, &mp); + sp_4096_mont_norm_128(norm, m); + + XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); + if (reduceA != 0) { + err = sp_4096_mod_128(t[1] + 128, a, m); + if (err == MP_OKAY) { + err = sp_4096_mod_128(t[1], t[1], m); + } + } + else { + XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); + err = sp_4096_mod_128(t[1], t[1], m); + } + } + + if (err == MP_OKAY) { + sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); + sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); + sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); + sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); + sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); + sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); + + i = (bits - 1) / 32; + n = e[i--]; + c = bits & 31; + if (c == 0) { + c = 32; + } + c -= bits % 3; + if (c == 32) { + c = 29; + } + if (c < 0) { + /* Number of bits in top word is less than number needed. */ + c = -c; + y = (byte)(n << c); + n = e[i--]; + y |= (byte)(n >> (64 - c)); + n <<= c; + c = 64 - c; + } + else { + y = (byte)(n >> c); + n <<= 32 - c; + } + XMEMCPY(r, t[y], sizeof(sp_digit) * 128); + for (; i>=0 || c>=3; ) { + if (c == 0) { + n = e[i--]; + y = (byte)(n >> 29); + n <<= 3; + c = 29; + } + else if (c < 3) { + y = (byte)(n >> 29); + n = e[i--]; + c = 3 - c; + y |= (byte)(n >> (32 - c)); + n <<= c; + c = 32 - c; + } + else { + y = (byte)((n >> 29) & 0x7); + n <<= 3; + c -= 3; + } + + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + sp_4096_mont_sqr_128(r, r, m, mp); + + sp_4096_mont_mul_128(r, r, t[y], m, mp); + } + + XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); + sp_4096_mont_reduce_128(r, m, mp); + + mask = 0 - (sp_4096_cmp_128(r, m) >= 0); + sp_4096_cond_sub_128(r, r, m, mask); + } + +#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) + if (td != NULL) + XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return err; +} +#else +/* Modular exponentiate a to the e mod m. (r = a^e mod m) + * + * r A single precision number that is the result of the operation. + * a A single precision number being exponentiated. + * e A single precision number that is the exponent. + * bits The number of bits in the exponent. + * m A single precision number that is the modulus. + * returns 0 on success and MEMORY_E on dynamic memory allocation failure. 
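
Each of these exponentiation routines ends with mask = 0 - (cmp >= 0) followed by a conditional subtraction, so the final reduction below the modulus does not branch on secret data. A hedged sketch of that idiom on full 32-bit digits (ct_cmp, cond_sub and reduce_once are illustrative names; whether the emitted machine code is genuinely constant time still depends on the compiler and target):

    #include <stdint.h>

    /* Compare little-endian digit arrays: < 0, 0 or > 0, visiting every
     * word and letting only the most significant difference survive. */
    static int64_t ct_cmp(const uint32_t* a, const uint32_t* b, int n)
    {
        int64_t r = 0;
        int i;

        for (i = n - 1; i >= 0; i--) {
            int64_t mask = 0 - (int64_t)(r == 0);   /* all ones until a difference is seen */
            r |= ((int64_t)a[i] - (int64_t)b[i]) & mask;
        }
        return r;
    }

    /* r = a - (b AND mask): subtract b only when mask is all ones. */
    static void cond_sub(uint32_t* r, const uint32_t* a, const uint32_t* b,
                         uint32_t mask, int n)
    {
        uint32_t borrow = 0;
        int i;

        for (i = 0; i < n; i++) {
            uint64_t d = (uint64_t)a[i] - (uint64_t)(b[i] & mask) - borrow;
            r[i] = (uint32_t)d;
            borrow = (uint32_t)(d >> 63);           /* 1 when the subtraction wrapped */
        }
    }

    /* Final step of the mod_exp routines: subtract m exactly when r >= m. */
    static void reduce_once(uint32_t* r, const uint32_t* m, int n)
    {
        uint32_t mask = (uint32_t)0 - (uint32_t)(ct_cmp(r, m, n) >= 0);
        cond_sub(r, r, m, mask, n);
    }
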
+ */ +static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, + int bits, const sp_digit* m, int reduceA) +{ #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else @@ -12226,163 +12352,6 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e sp_4096_cond_sub_128(r, r, m, mask); } -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - if (td != NULL) - XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); -#endif - - return err; -} -#else -/* Modular exponentiate a to the e mod m. (r = a^e mod m) - * - * r A single precision number that is the result of the operation. - * a A single precision number being exponentiated. - * e A single precision number that is the exponent. - * bits The number of bits in the exponent. - * m A single precision number that is the modulus. - * returns 0 on success and MEMORY_E on dynamic memory allocation failure. - */ -static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e, - int bits, const sp_digit* m, int reduceA) -{ -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - sp_digit* td = NULL; -#else - sp_digit td[32 * 256]; -#endif - sp_digit* t[32]; - sp_digit* norm = NULL; - sp_digit mp = 1; - sp_digit n; - sp_digit mask; - int i; - int c; - byte y; - int err = MP_OKAY; - -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 256), NULL, - DYNAMIC_TYPE_TMP_BUFFER); - if (td == NULL) - err = MEMORY_E; -#endif - - if (err == MP_OKAY) { - norm = td; - for (i=0; i<32; i++) { - t[i] = td + i * 256; - } - - sp_4096_mont_setup(m, &mp); - sp_4096_mont_norm_128(norm, m); - - XMEMSET(t[1], 0, sizeof(sp_digit) * 128U); - if (reduceA != 0) { - err = sp_4096_mod_128(t[1] + 128, a, m); - if (err == MP_OKAY) { - err = sp_4096_mod_128(t[1], t[1], m); - } - } - else { - XMEMCPY(t[1] + 128, a, sizeof(sp_digit) * 128); - err = sp_4096_mod_128(t[1], t[1], m); - } - } - - if (err == MP_OKAY) { - sp_4096_mont_sqr_128(t[ 2], t[ 1], m, mp); - sp_4096_mont_mul_128(t[ 3], t[ 2], t[ 1], m, mp); - sp_4096_mont_sqr_128(t[ 4], t[ 2], m, mp); - sp_4096_mont_mul_128(t[ 5], t[ 3], t[ 2], m, mp); - sp_4096_mont_sqr_128(t[ 6], t[ 3], m, mp); - sp_4096_mont_mul_128(t[ 7], t[ 4], t[ 3], m, mp); - sp_4096_mont_sqr_128(t[ 8], t[ 4], m, mp); - sp_4096_mont_mul_128(t[ 9], t[ 5], t[ 4], m, mp); - sp_4096_mont_sqr_128(t[10], t[ 5], m, mp); - sp_4096_mont_mul_128(t[11], t[ 6], t[ 5], m, mp); - sp_4096_mont_sqr_128(t[12], t[ 6], m, mp); - sp_4096_mont_mul_128(t[13], t[ 7], t[ 6], m, mp); - sp_4096_mont_sqr_128(t[14], t[ 7], m, mp); - sp_4096_mont_mul_128(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_128(t[16], t[ 8], m, mp); - sp_4096_mont_mul_128(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_128(t[18], t[ 9], m, mp); - sp_4096_mont_mul_128(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_128(t[20], t[10], m, mp); - sp_4096_mont_mul_128(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_128(t[22], t[11], m, mp); - sp_4096_mont_mul_128(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_128(t[24], t[12], m, mp); - sp_4096_mont_mul_128(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_128(t[26], t[13], m, mp); - sp_4096_mont_mul_128(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_128(t[28], t[14], m, mp); - sp_4096_mont_mul_128(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_128(t[30], t[15], m, mp); - sp_4096_mont_mul_128(t[31], t[16], t[15], m, mp); - - i = (bits - 1) / 32; - n = e[i--]; - c = bits & 31; - 
if (c == 0) { - c = 32; - } - c -= bits % 5; - if (c == 32) { - c = 27; - } - if (c < 0) { - /* Number of bits in top word is less than number needed. */ - c = -c; - y = (byte)(n << c); - n = e[i--]; - y |= (byte)(n >> (64 - c)); - n <<= c; - c = 64 - c; - } - else { - y = (byte)(n >> c); - n <<= 32 - c; - } - XMEMCPY(r, t[y], sizeof(sp_digit) * 128); - for (; i>=0 || c>=5; ) { - if (c == 0) { - n = e[i--]; - y = (byte)(n >> 27); - n <<= 5; - c = 27; - } - else if (c < 5) { - y = (byte)(n >> 27); - n = e[i--]; - c = 5 - c; - y |= (byte)(n >> (32 - c)); - n <<= c; - c = 32 - c; - } - else { - y = (byte)((n >> 27) & 0x1f); - n <<= 5; - c -= 5; - } - - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - sp_4096_mont_sqr_128(r, r, m, mp); - - sp_4096_mont_mul_128(r, r, t[y], m, mp); - } - - XMEMSET(&r[128], 0, sizeof(sp_digit) * 128U); - sp_4096_mont_reduce_128(r, m, mp); - - mask = 0 - (sp_4096_cmp_128(r, m) >= 0); - sp_4096_cond_sub_128(r, r, m, mask); - } - #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (td != NULL) XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -12413,7 +12382,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_digit* a = NULL; #else sp_digit a[128 * 5]; -#endif +#endif sp_digit* m = NULL; sp_digit* r = NULL; sp_digit *ah = NULL; @@ -12511,7 +12480,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -12643,7 +12612,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -12734,7 +12703,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[64], 0, sizeof(sp_digit) * 64); sp_4096_add_128(r, tmpb, tmpa); - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; } @@ -13834,7 +13803,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_128(r, out); *outLen = 512; for (i=0; i<512 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -16685,7 +16654,7 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_256_cmp_8(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -17009,7 +16978,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; - int32_t n; + sp_int32 n; sp_256_mont_inv_8(t1, p->z, t + 2*8); @@ -21072,7 +21041,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_8(sp_digit* r, byte* a) { int i; int j; @@ -21155,7 +21124,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_8(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_8(point->x, out); *outLen = 32; } @@ -21648,7 +21617,7 @@ static int sp_256_calc_s_8(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -21760,7 +21729,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 8U); sp_256_norm_8(ctx->r); @@ -21809,7 +21778,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_256_norm_8(ctx->x); carry = sp_256_add_8(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_8(ctx->s, ctx->s, @@ -21879,7 +21848,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -22489,7 +22458,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_8(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -22544,7 +22513,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -24272,7 +24241,7 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static int32_t sp_384_cmp_12(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -24329,7 +24298,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; - int32_t n; + sp_int32 n; sp_384_mont_inv_12(t1, p->z, t + 2*12); @@ -28270,7 +28239,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) * r A single precision integer. * a Byte array. 
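
sp_256_calc_s_8 above (and the 384-bit counterpart further down) computes the ECDSA signature value, in textbook terms s = kInv * (e + r*d) mod n with e the truncated hash, working on Montgomery-form values modulo the curve order. A toy sketch of that formula with small integers (the tiny prime order, the Fermat-style inversion and all constants are illustrative assumptions, not the SP implementation):

    #include <stdio.h>
    #include <stdint.h>

    /* Only safe for tiny operands; the real code uses multi-precision digits. */
    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t m) { return (a * b) % m; }

    static uint64_t powmod(uint64_t b, uint64_t e, uint64_t m)
    {
        uint64_t r = 1;
        b %= m;
        while (e != 0) {
            if (e & 1)
                r = mulmod(r, b, m);
            b = mulmod(b, b, m);
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t n = 23;                      /* toy stand-in for the (prime) curve order */
        uint64_t k = 7, e = 5, r = 11, d = 9; /* nonce, hash, r = x1 mod n, private key */
        uint64_t kInv = powmod(k, n - 2, n);  /* inverse of k via Fermat: 10 */
        uint64_t s = mulmod(kInv, (e + mulmod(r, d, n)) % n, n);

        printf("s = %llu\n", (unsigned long long)s);   /* prints 5 */
        return 0;
    }
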
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_12(sp_digit* r, byte* a) { int i; int j; @@ -28353,7 +28322,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_12(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_12(point->x, out); *outLen = 48; } @@ -28831,7 +28800,7 @@ static int sp_384_calc_s_12(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int32_t c; + sp_int32 c; sp_digit* kInv = k; /* Conv k to Montgomery form (mod order) */ @@ -28943,7 +28912,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int32_t c; + sp_int32 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 12U); sp_384_norm_12(ctx->r); @@ -28992,7 +28961,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int32_t c; + sp_int32 c; sp_384_norm_12(ctx->x); carry = sp_384_add_12(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_12(ctx->s, ctx->s, @@ -29062,7 +29031,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int32_t c; + sp_int32 c; int err = MP_OKAY; int i; @@ -29720,7 +29689,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int32_t c = 0; + sp_int32 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_12(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -29775,7 +29744,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int32_t c = 0; + sp_int32 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -31579,7 +31548,7 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static int32_t sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) +SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) { sp_digit r = 0; @@ -32149,7 +32118,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; - int32_t n; + sp_int32 n; sp_1024_mont_inv_32(t1, p->z, t + 2*32); @@ -42308,7 +42277,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, sp_digit t1[32 * 4]; #endif sp_digit* t2 = NULL; - int32_t n; + sp_int32 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index c4e26e1b8..7b23b1f5c 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -47,6 +47,17 @@ #include #ifdef WOLFSSL_SP_X86_64_ASM +#define SP_PRINT_NUM(var, name, total, words, bits) \ + do { \ + int ii \ + fprintf(stderr, name "=0x"); \ + for (ii = words - 1; ii >= 0; ii--) \ + fprintf(stderr, SP_PRINT_FMT, (var)[ii]); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define SP_PRINT_VAL(var, name) \ + fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var) #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) #ifndef WOLFSSL_SP_NO_2048 extern void sp_2048_from_bin_bswap(sp_digit* r, int size, const byte* a, int n); @@ -158,29 +169,41 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_2048_to_bin_bswap(sp_digit* r, byte* a); -extern void sp_2048_to_bin_movbe(sp_digit* r, byte* a); +extern void sp_2048_to_bin_bswap_32(sp_digit* r, byte* a); +extern void sp_2048_to_bin_movbe_32(sp_digit* r, byte* a); /* Write r as big endian to byte array. * Fixed length number of bytes written: 256 * * r A single precision integer. * a Byte array. */ -static void sp_2048_to_bin(sp_digit* r, byte* a) +static void sp_2048_to_bin_32(sp_digit* r, byte* a) { #ifndef NO_MOVBE_SUPPORT word32 cpuid_flags = cpuid_get_flags(); if (IS_INTEL_MOVBE(cpuid_flags)) { - sp_2048_to_bin_movbe(r, a); + sp_2048_to_bin_movbe_32(r, a); } else #endif { - sp_2048_to_bin_bswap(r, a); + sp_2048_to_bin_bswap_32(r, a); } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_32(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_2048_norm_32(a) + extern void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_2048_sqr_16(sp_digit* r, const sp_digit* a); extern void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -344,7 +367,7 @@ static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_2048_cmp_16(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_2048_cmp_16(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -928,7 +951,7 @@ static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_2048_cmp_32(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. 
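
The renamed sp_2048_to_bin_32 above keeps the existing shape: probe the cached CPUID flags once and call either the MOVBE or the BSWAP assembly serializer. A hedged sketch of that runtime-dispatch idea in portable-ish C (has_movbe, store_be64 and to_bin_sketch are illustrative; the CPUID probe shown is the generic leaf-1 ECX bit 22 test, not wolfSSL's cpuid_get_flags/IS_INTEL_MOVBE):

    #include <stdint.h>
    #include <string.h>
    #include <cpuid.h>                  /* GCC/Clang, x86 only */

    #define MOVBE_ECX_BIT (1u << 22)    /* CPUID.01H:ECX bit 22 advertises MOVBE */

    static int has_movbe(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;
        return (ecx & MOVBE_ECX_BIT) != 0;
    }

    static void store_be64(unsigned char* out, uint64_t v)
    {
        v = __builtin_bswap64(v);       /* GCC/Clang builtin */
        memcpy(out, &v, sizeof(v));
    }

    /* Serialize little-endian 64-bit digits as one big-endian byte string
     * of 8 * words bytes, as the to_bin_bswap/to_bin_movbe variants do. */
    static void to_bin_bswap_sketch(const uint64_t* r, unsigned char* a, int words)
    {
        int i;

        for (i = 0; i < words; i++)
            store_be64(a + 8 * i, r[words - 1 - i]);
    }

    static void to_bin_sketch(const uint64_t* r, unsigned char* a, int words)
    {
        if (has_movbe()) {
            /* the real code calls a MOVBE-based assembly routine here;
             * this sketch just reuses the byte-swapping path */
            to_bin_bswap_sketch(r, a, words);
        }
        else {
            to_bin_bswap_sketch(r, a, words);
        }
    }
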
* @@ -1099,9 +1122,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 64) + 64]; + sp_digit td[(17 * 64) + 64]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -1113,7 +1136,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 64) + 64, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 64) + 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -1123,13 +1146,13 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 64; - rt = td + 2048; + rt = td + 1024; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 64]; - rt = &td[2048]; + rt = &td[1024]; #endif sp_2048_mont_setup(m, &mp); @@ -1162,22 +1185,6 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_32(t[14], t[ 7], m, mp); sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_32(t[16], t[ 8], m, mp); - sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_32(t[18], t[ 9], m, mp); - sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_32(t[20], t[10], m, mp); - sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_32(t[22], t[11], m, mp); - sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_32(t[24], t[12], m, mp); - sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_32(t[26], t[13], m, mp); - sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_32(t[28], t[14], m, mp); - sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_32(t[30], t[15], m, mp); - sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -1186,12 +1193,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. */ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. */ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. 
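
These routines begin with sp_2048_mont_setup(m, &mp); for 64-bit digits the constant mp is conventionally -1/m mod 2^64, the per-word multiplier the sp_2048_mont_reduce_* and mont_mul_* calls consume. A hedged sketch of the usual Newton/Hensel derivation of that constant (mont_rho64 is an illustrative name, not the wolfSSL routine; m0 is the least significant, odd, word of the modulus):

    #include <stdint.h>

    /* -1/m0 mod 2^64 for odd m0.  Each step doubles the number of correct
     * low bits: if x*m0 == 1 (mod 2^k) then (x*(2 - m0*x))*m0 == 1 (mod 2^(2k)). */
    static uint64_t mont_rho64(uint64_t m0)
    {
        uint64_t x = m0;        /* m0*m0 == 1 (mod 8), so x is already correct to 3 bits */

        x *= 2 - m0 * x;        /*  6 bits */
        x *= 2 - m0 * x;        /* 12 bits */
        x *= 2 - m0 * x;        /* 24 bits */
        x *= 2 - m0 * x;        /* 48 bits */
        x *= 2 - m0 * x;        /* 96 bits, i.e. all 64: x == 1/m0 mod 2^64 */

        return (uint64_t)0 - x; /* negate to get -1/m0 mod 2^64 */
    }

As a sanity check, mont_rho64(0xFFFFFFFFFFFFFFFFULL) is 1.
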
*/ @@ -1207,22 +1214,22 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -1236,10 +1243,8 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_reduce_32(rt, m, mp); sp_2048_sqr_32(r, rt); sp_2048_mont_reduce_32(r, m, mp); - sp_2048_sqr_32(rt, r); - sp_2048_mont_reduce_32(rt, m, mp); - sp_2048_mul_32(r, rt, t[y]); + sp_2048_mul_32(r, r, t[y]); sp_2048_mont_reduce_32(r, m, mp); } @@ -1311,9 +1316,9 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 64) + 64]; + sp_digit td[(17 * 64) + 64]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -1325,7 +1330,7 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 64) + 64, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 64) + 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -1335,13 +1340,13 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 64; - rt = td + 2048; + rt = td + 1024; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 64]; - rt = &td[2048]; + rt = &td[1024]; #endif sp_2048_mont_setup(m, &mp); @@ -1374,22 +1379,6 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi sp_2048_mont_mul_avx2_32(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_avx2_32(t[14], t[ 7], m, mp); sp_2048_mont_mul_avx2_32(t[15], t[ 8], t[ 7], m, mp); - sp_2048_mont_sqr_avx2_32(t[16], t[ 8], m, mp); - sp_2048_mont_mul_avx2_32(t[17], t[ 9], t[ 8], m, mp); - sp_2048_mont_sqr_avx2_32(t[18], t[ 9], m, mp); - sp_2048_mont_mul_avx2_32(t[19], t[10], t[ 9], m, mp); - sp_2048_mont_sqr_avx2_32(t[20], t[10], m, mp); - sp_2048_mont_mul_avx2_32(t[21], t[11], t[10], m, mp); - sp_2048_mont_sqr_avx2_32(t[22], t[11], m, mp); - sp_2048_mont_mul_avx2_32(t[23], t[12], t[11], m, mp); - sp_2048_mont_sqr_avx2_32(t[24], t[12], m, mp); - sp_2048_mont_mul_avx2_32(t[25], t[13], t[12], m, mp); - sp_2048_mont_sqr_avx2_32(t[26], t[13], m, mp); - sp_2048_mont_mul_avx2_32(t[27], t[14], t[13], m, mp); - sp_2048_mont_sqr_avx2_32(t[28], t[14], m, mp); - sp_2048_mont_mul_avx2_32(t[29], t[15], t[14], m, mp); - sp_2048_mont_sqr_avx2_32(t[30], t[15], m, mp); - sp_2048_mont_mul_avx2_32(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -1398,12 +1387,12 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. 
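
The change running through these x86_64 mod_exp routines is a window-size reduction from 5 bits (t[32], five squarings then one multiply per window) to 4 bits (t[16], four squarings then one multiply), which halves the precomputed table. A rough back-of-the-envelope count of that trade-off (generic estimates, not measurements of this code):

    #include <stdio.h>

    int main(void)
    {
        int bits = 2048;        /* exponent length */
        int words = 32;         /* 64-bit digits in a 2048-bit value */
        int w;

        for (w = 3; w <= 6; w++) {
            int setup   = (1 << w) - 2;     /* sqr/mul to fill t[2..2^w - 1] */
            int sqrs    = bits;             /* roughly one squaring per exponent bit */
            int muls    = bits / w;         /* one table multiply per window */
            int scratch = (1 << w) * words; /* digits of precomputed table */

            printf("w=%d: ~%4d sqr + %3d mul + %2d setup, ~%4d digits of table\n",
                   w, sqrs, muls, setup, scratch);
        }
        return 0;
    }

Larger windows shave off a few multiplies per exponent but cost exponentially more table setup and memory, which is the trade-off behind picking a modest window here.
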
*/ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. */ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -1419,22 +1408,22 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -1448,10 +1437,8 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi sp_2048_mont_reduce_avx2_32(rt, m, mp); sp_2048_sqr_avx2_32(r, rt); sp_2048_mont_reduce_avx2_32(r, m, mp); - sp_2048_sqr_avx2_32(rt, r); - sp_2048_mont_reduce_avx2_32(rt, m, mp); - sp_2048_mul_avx2_32(r, rt, t[y]); + sp_2048_mul_avx2_32(r, r, t[y]); sp_2048_mont_reduce_avx2_32(r, m, mp); } @@ -1619,7 +1606,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -1710,7 +1697,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -1863,7 +1850,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[16], 0, sizeof(sp_digit) * 16); sp_2048_add_32(r, tmpb, tmpa); - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; } @@ -2318,7 +2305,7 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_2048_to_bin(r, out); + sp_2048_to_bin_32(r, out); *outLen = 256; for (i=0; i<256 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -2499,29 +2486,41 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_3072_to_bin_bswap(sp_digit* r, byte* a); -extern void sp_3072_to_bin_movbe(sp_digit* r, byte* a); +extern void sp_3072_to_bin_bswap_48(sp_digit* r, byte* a); +extern void sp_3072_to_bin_movbe_48(sp_digit* r, byte* a); /* Write r as big endian to byte array. * Fixed length number of bytes written: 384 * * r A single precision integer. * a Byte array. */ -static void sp_3072_to_bin(sp_digit* r, byte* a) +static void sp_3072_to_bin_48(sp_digit* r, byte* a) { #ifndef NO_MOVBE_SUPPORT word32 cpuid_flags = cpuid_get_flags(); if (IS_INTEL_MOVBE(cpuid_flags)) { - sp_3072_to_bin_movbe(r, a); + sp_3072_to_bin_movbe_48(r, a); } else #endif { - sp_3072_to_bin_bswap(r, a); + sp_3072_to_bin_bswap_48(r, a); } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_3072_norm_48(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. 
+ */ +#define sp_3072_norm_48(a) + extern void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_3072_sqr_12(sp_digit* r, const sp_digit* a); extern void sp_3072_mul_avx2_12(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -2695,7 +2694,7 @@ static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_3072_cmp_24(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_3072_cmp_24(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -3279,7 +3278,7 @@ static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_3072_cmp_48(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -3450,9 +3449,9 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 96) + 96]; + sp_digit td[(17 * 96) + 96]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -3464,7 +3463,7 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 96) + 96, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 96) + 96, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -3474,13 +3473,13 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 96; - rt = td + 3072; + rt = td + 1536; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 96]; - rt = &td[3072]; + rt = &td[1536]; #endif sp_3072_mont_setup(m, &mp); @@ -3513,22 +3512,6 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_mont_mul_48(t[13], t[ 7], t[ 6], m, mp); sp_3072_mont_sqr_48(t[14], t[ 7], m, mp); sp_3072_mont_mul_48(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_48(t[16], t[ 8], m, mp); - sp_3072_mont_mul_48(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_48(t[18], t[ 9], m, mp); - sp_3072_mont_mul_48(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_48(t[20], t[10], m, mp); - sp_3072_mont_mul_48(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_48(t[22], t[11], m, mp); - sp_3072_mont_mul_48(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_48(t[24], t[12], m, mp); - sp_3072_mont_mul_48(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_48(t[26], t[13], m, mp); - sp_3072_mont_mul_48(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_48(t[28], t[14], m, mp); - sp_3072_mont_mul_48(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_48(t[30], t[15], m, mp); - sp_3072_mont_mul_48(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -3537,12 +3520,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. */ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. 
*/ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -3558,22 +3541,22 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 48); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -3587,10 +3570,8 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_mont_reduce_48(rt, m, mp); sp_3072_sqr_48(r, rt); sp_3072_mont_reduce_48(r, m, mp); - sp_3072_sqr_48(rt, r); - sp_3072_mont_reduce_48(rt, m, mp); - sp_3072_mul_48(r, rt, t[y]); + sp_3072_mul_48(r, r, t[y]); sp_3072_mont_reduce_48(r, m, mp); } @@ -3662,9 +3643,9 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 96) + 96]; + sp_digit td[(17 * 96) + 96]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -3676,7 +3657,7 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 96) + 96, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 96) + 96, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -3686,13 +3667,13 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 96; - rt = td + 3072; + rt = td + 1536; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 96]; - rt = &td[3072]; + rt = &td[1536]; #endif sp_3072_mont_setup(m, &mp); @@ -3725,22 +3706,6 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi sp_3072_mont_mul_avx2_48(t[13], t[ 7], t[ 6], m, mp); sp_3072_mont_sqr_avx2_48(t[14], t[ 7], m, mp); sp_3072_mont_mul_avx2_48(t[15], t[ 8], t[ 7], m, mp); - sp_3072_mont_sqr_avx2_48(t[16], t[ 8], m, mp); - sp_3072_mont_mul_avx2_48(t[17], t[ 9], t[ 8], m, mp); - sp_3072_mont_sqr_avx2_48(t[18], t[ 9], m, mp); - sp_3072_mont_mul_avx2_48(t[19], t[10], t[ 9], m, mp); - sp_3072_mont_sqr_avx2_48(t[20], t[10], m, mp); - sp_3072_mont_mul_avx2_48(t[21], t[11], t[10], m, mp); - sp_3072_mont_sqr_avx2_48(t[22], t[11], m, mp); - sp_3072_mont_mul_avx2_48(t[23], t[12], t[11], m, mp); - sp_3072_mont_sqr_avx2_48(t[24], t[12], m, mp); - sp_3072_mont_mul_avx2_48(t[25], t[13], t[12], m, mp); - sp_3072_mont_sqr_avx2_48(t[26], t[13], m, mp); - sp_3072_mont_mul_avx2_48(t[27], t[14], t[13], m, mp); - sp_3072_mont_sqr_avx2_48(t[28], t[14], m, mp); - sp_3072_mont_mul_avx2_48(t[29], t[15], t[14], m, mp); - sp_3072_mont_sqr_avx2_48(t[30], t[15], m, mp); - sp_3072_mont_mul_avx2_48(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -3749,12 +3714,12 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi if (c 
== 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. */ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. */ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -3770,22 +3735,22 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 48); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -3799,10 +3764,8 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi sp_3072_mont_reduce_avx2_48(rt, m, mp); sp_3072_sqr_avx2_48(r, rt); sp_3072_mont_reduce_avx2_48(r, m, mp); - sp_3072_sqr_avx2_48(rt, r); - sp_3072_mont_reduce_avx2_48(rt, m, mp); - sp_3072_mul_avx2_48(r, rt, t[y]); + sp_3072_mul_avx2_48(r, r, t[y]); sp_3072_mont_reduce_avx2_48(r, m, mp); } @@ -3970,7 +3933,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -4061,7 +4024,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -4214,7 +4177,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[24], 0, sizeof(sp_digit) * 24); sp_3072_add_48(r, tmpb, tmpa); - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; } @@ -4669,7 +4632,7 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_3072_to_bin(r, out); + sp_3072_to_bin_48(r, out); *outLen = 384; for (i=0; i<384 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -4850,29 +4813,41 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) #endif } -extern void sp_4096_to_bin_bswap(sp_digit* r, byte* a); -extern void sp_4096_to_bin_movbe(sp_digit* r, byte* a); +extern void sp_4096_to_bin_bswap_64(sp_digit* r, byte* a); +extern void sp_4096_to_bin_movbe_64(sp_digit* r, byte* a); /* Write r as big endian to byte array. * Fixed length number of bytes written: 512 * * r A single precision integer. * a Byte array. */ -static void sp_4096_to_bin(sp_digit* r, byte* a) +static void sp_4096_to_bin_64(sp_digit* r, byte* a) { #ifndef NO_MOVBE_SUPPORT word32 cpuid_flags = cpuid_get_flags(); if (IS_INTEL_MOVBE(cpuid_flags)) { - sp_4096_to_bin_movbe(r, a); + sp_4096_to_bin_movbe_64(r, a); } else #endif { - sp_4096_to_bin_bswap(r, a); + sp_4096_to_bin_bswap_64(r, a); } } +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. + */ +#define sp_4096_norm_64(a) + +/* Normalize the values in each word to 64. + * + * a Array of sp_digit to normalize. 
+ */ +#define sp_4096_norm_64(a) + extern sp_digit sp_4096_sub_in_place_64(sp_digit* a, const sp_digit* b); extern sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_4096_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b); @@ -5028,7 +5003,7 @@ static void sp_4096_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_4096_cmp_64(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_4096_cmp_64(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -5199,9 +5174,9 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 128) + 128]; + sp_digit td[(17 * 128) + 128]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -5213,7 +5188,7 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 128) + 128, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 128) + 128, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -5223,13 +5198,13 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 128; - rt = td + 4096; + rt = td + 2048; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 128]; - rt = &td[4096]; + rt = &td[2048]; #endif sp_4096_mont_setup(m, &mp); @@ -5262,22 +5237,6 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_4096_mont_mul_64(t[13], t[ 7], t[ 6], m, mp); sp_4096_mont_sqr_64(t[14], t[ 7], m, mp); sp_4096_mont_mul_64(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_64(t[16], t[ 8], m, mp); - sp_4096_mont_mul_64(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_64(t[18], t[ 9], m, mp); - sp_4096_mont_mul_64(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_64(t[20], t[10], m, mp); - sp_4096_mont_mul_64(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_64(t[22], t[11], m, mp); - sp_4096_mont_mul_64(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_64(t[24], t[12], m, mp); - sp_4096_mont_mul_64(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_64(t[26], t[13], m, mp); - sp_4096_mont_mul_64(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_64(t[28], t[14], m, mp); - sp_4096_mont_mul_64(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_64(t[30], t[15], m, mp); - sp_4096_mont_mul_64(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -5286,12 +5245,12 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. */ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. */ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. 
*/ @@ -5307,22 +5266,22 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -5336,10 +5295,8 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_4096_mont_reduce_64(rt, m, mp); sp_4096_sqr_64(r, rt); sp_4096_mont_reduce_64(r, m, mp); - sp_4096_sqr_64(rt, r); - sp_4096_mont_reduce_64(rt, m, mp); - sp_4096_mul_64(r, rt, t[y]); + sp_4096_mul_64(r, r, t[y]); sp_4096_mont_reduce_64(r, m, mp); } @@ -5411,9 +5368,9 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td; #else - sp_digit td[(33 * 128) + 128]; + sp_digit td[(17 * 128) + 128]; #endif - sp_digit* t[32]; + sp_digit* t[16]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -5425,7 +5382,7 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (33 * 128) + 128, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 128) + 128, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -5435,13 +5392,13 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = td + i * 128; - rt = td + 4096; + rt = td + 2048; #else - for (i=0; i<32; i++) + for (i=0; i<16; i++) t[i] = &td[i * 128]; - rt = &td[4096]; + rt = &td[2048]; #endif sp_4096_mont_setup(m, &mp); @@ -5474,22 +5431,6 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi sp_4096_mont_mul_avx2_64(t[13], t[ 7], t[ 6], m, mp); sp_4096_mont_sqr_avx2_64(t[14], t[ 7], m, mp); sp_4096_mont_mul_avx2_64(t[15], t[ 8], t[ 7], m, mp); - sp_4096_mont_sqr_avx2_64(t[16], t[ 8], m, mp); - sp_4096_mont_mul_avx2_64(t[17], t[ 9], t[ 8], m, mp); - sp_4096_mont_sqr_avx2_64(t[18], t[ 9], m, mp); - sp_4096_mont_mul_avx2_64(t[19], t[10], t[ 9], m, mp); - sp_4096_mont_sqr_avx2_64(t[20], t[10], m, mp); - sp_4096_mont_mul_avx2_64(t[21], t[11], t[10], m, mp); - sp_4096_mont_sqr_avx2_64(t[22], t[11], m, mp); - sp_4096_mont_mul_avx2_64(t[23], t[12], t[11], m, mp); - sp_4096_mont_sqr_avx2_64(t[24], t[12], m, mp); - sp_4096_mont_mul_avx2_64(t[25], t[13], t[12], m, mp); - sp_4096_mont_sqr_avx2_64(t[26], t[13], m, mp); - sp_4096_mont_mul_avx2_64(t[27], t[14], t[13], m, mp); - sp_4096_mont_sqr_avx2_64(t[28], t[14], m, mp); - sp_4096_mont_mul_avx2_64(t[29], t[15], t[14], m, mp); - sp_4096_mont_sqr_avx2_64(t[30], t[15], m, mp); - sp_4096_mont_mul_avx2_64(t[31], t[16], t[15], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -5498,12 +5439,12 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 5. 
*/ - if ((bits % 5) == 0) { - c -= 5; + /* Minus the number of top bits to use so rest is a multiple of 4. */ + if ((bits % 4) == 0) { + c -= 4; } else { - c -= bits % 5; + c -= bits % 4; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -5519,22 +5460,22 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi n <<= 64 - c; } XMEMCPY(r, t[y], sizeof(sp_digit) * 64); - for (; i>=0 || c>=5; ) { - if (c >= 5) { - y = (byte)((n >> 59) & 0x1f); - n <<= 5; - c -= 5; + for (; i>=0 || c>=4; ) { + if (c >= 4) { + y = (byte)((n >> 60) & 0xf); + n <<= 4; + c -= 4; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 59); - n <<= 5; - c = 59; + y = (byte)(n >> 60); + n <<= 4; + c = 60; } else { - y = (byte)(n >> 59); + y = (byte)(n >> 60); n = e[i--]; - c = 5 - c; + c = 4 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -5548,10 +5489,8 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi sp_4096_mont_reduce_avx2_64(rt, m, mp); sp_4096_sqr_avx2_64(r, rt); sp_4096_mont_reduce_avx2_64(r, m, mp); - sp_4096_sqr_avx2_64(rt, r); - sp_4096_mont_reduce_avx2_64(rt, m, mp); - sp_4096_mul_avx2_64(r, rt, t[y]); + sp_4096_mul_avx2_64(r, r, t[y]); sp_4096_mont_reduce_avx2_64(r, m, mp); } @@ -5719,7 +5658,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -5810,7 +5749,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -5963,7 +5902,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32); sp_4096_add_64(r, tmpb, tmpa); - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; } @@ -6418,7 +6357,7 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, } if (err == MP_OKAY) { - sp_4096_to_bin(r, out); + sp_4096_to_bin_64(r, out); *outLen = 512; for (i=0; i<512 && out[i] == 0; i++) { /* Search for first non-zero. */ @@ -6882,7 +6821,7 @@ static void sp_256_mont_inv_4(sp_digit* r, const sp_digit* a, sp_digit* td) #endif /* WOLFSSL_SP_SMALL */ } -extern int64_t sp_256_cmp_4(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_256_cmp_4(const sp_digit* a, const sp_digit* b); /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -6904,7 +6843,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; - int64_t n; + sp_int64 n; sp_256_mont_inv_4(t1, p->z, t + 2*4); @@ -8013,7 +7952,7 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; - int64_t n; + sp_int64 n; sp_256_mont_inv_avx2_4(t1, p->z, t + 2*4); @@ -22923,26 +22862,26 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) } #ifdef HAVE_ECC_DHE -extern void sp_256_to_bin_bswap(sp_digit* r, byte* a); -extern void sp_256_to_bin_movbe(sp_digit* r, byte* a); +extern void sp_256_to_bin_bswap_4(sp_digit* r, byte* a); +extern void sp_256_to_bin_movbe_4(sp_digit* r, byte* a); /* Write r as big endian to byte array. * Fixed length number of bytes written: 32 * * r A single precision integer. * a Byte array. 
*/ -static void sp_256_to_bin(sp_digit* r, byte* a) +static void sp_256_to_bin_4(sp_digit* r, byte* a) { #ifndef NO_MOVBE_SUPPORT word32 cpuid_flags = cpuid_get_flags(); if (IS_INTEL_MOVBE(cpuid_flags)) { - sp_256_to_bin_movbe(r, a); + sp_256_to_bin_movbe_4(r, a); } else #endif { - sp_256_to_bin_bswap(r, a); + sp_256_to_bin_bswap_4(r, a); } } @@ -23003,7 +22942,7 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_256_ecc_mulmod_4(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_256_to_bin(point->x, out); + sp_256_to_bin_4(point->x, out); *outLen = 32; } @@ -23564,7 +23503,7 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; #ifdef HAVE_INTEL_AVX2 word32 cpuid_flags = cpuid_get_flags(); @@ -23699,7 +23638,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 4U); sp_256_norm_4(ctx->r); @@ -23748,7 +23687,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_256_norm_4(ctx->x); carry = sp_256_add_4(ctx->s, ctx->e, ctx->x); sp_256_cond_sub_4(ctx->s, ctx->s, @@ -23818,7 +23757,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; #ifdef HAVE_INTEL_AVX2 @@ -24206,7 +24145,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_256_cmp_4(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -24261,7 +24200,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_256* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -25521,7 +25460,7 @@ static void sp_384_mont_inv_6(sp_digit* r, const sp_digit* a, sp_digit* td) #endif /* WOLFSSL_SP_SMALL */ } -extern int64_t sp_384_cmp_6(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_384_cmp_6(const sp_digit* a, const sp_digit* b); /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -25539,7 +25478,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; - int64_t n; + sp_int64 n; sp_384_mont_inv_6(t1, p->z, t + 2*6); @@ -26757,7 +26696,7 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; - int64_t n; + sp_int64 n; sp_384_mont_inv_avx2_6(t1, p->z, t + 2*6); @@ -47481,26 +47420,26 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) } #ifdef HAVE_ECC_DHE -extern void sp_384_to_bin_bswap(sp_digit* r, byte* a); -extern void sp_384_to_bin_movbe(sp_digit* r, byte* a); +extern void sp_384_to_bin_bswap_6(sp_digit* r, byte* a); +extern void sp_384_to_bin_movbe_6(sp_digit* r, byte* a); /* Write r as big endian to byte array. * Fixed length number of bytes written: 48 * * r A single precision integer. * a Byte array. 
*/ -static void sp_384_to_bin(sp_digit* r, byte* a) +static void sp_384_to_bin_6(sp_digit* r, byte* a) { #ifndef NO_MOVBE_SUPPORT word32 cpuid_flags = cpuid_get_flags(); if (IS_INTEL_MOVBE(cpuid_flags)) { - sp_384_to_bin_movbe(r, a); + sp_384_to_bin_movbe_6(r, a); } else #endif { - sp_384_to_bin_bswap(r, a); + sp_384_to_bin_bswap_6(r, a); } } @@ -47561,7 +47500,7 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, err = sp_384_ecc_mulmod_6(point, point, k, 1, 1, heap); } if (err == MP_OKAY) { - sp_384_to_bin(point->x, out); + sp_384_to_bin_6(point->x, out); *outLen = 48; } @@ -48069,7 +48008,7 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, { int err; sp_digit carry; - int64_t c; + sp_int64 c; sp_digit* kInv = k; #ifdef HAVE_INTEL_AVX2 word32 cpuid_flags = cpuid_get_flags(); @@ -48204,7 +48143,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W break; case 3: /* MODORDER */ { - int64_t c; + sp_int64 c; /* r = point->x mod order */ XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 6U); sp_384_norm_6(ctx->r); @@ -48253,7 +48192,7 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 9: /* S2 */ { sp_digit carry; - int64_t c; + sp_int64 c; sp_384_norm_6(ctx->x); carry = sp_384_add_6(ctx->s, ctx->e, ctx->x); sp_384_cond_sub_6(ctx->s, ctx->s, @@ -48323,7 +48262,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* r = NULL; sp_digit* tmp = NULL; sp_digit* s = NULL; - int64_t c; + sp_int64 c; int err = MP_OKAY; int i; #ifdef HAVE_INTEL_AVX2 @@ -48788,7 +48727,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, break; case 12: /* RES */ { - int64_t c = 0; + sp_int64 c = 0; err = MP_OKAY; /* math okay, now check result */ *res = (int)(sp_384_cmp_6(ctx->p1.x, ctx->u1) == 0); if (*res == 0) { @@ -48843,7 +48782,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* tmp = NULL; sp_point_384* p2 = NULL; sp_digit carry; - int64_t c = 0; + sp_int64 c = 0; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) @@ -49791,7 +49730,7 @@ static void sp_1024_mask_16(sp_digit* r, const sp_digit* a, sp_digit m) #endif } -extern int64_t sp_1024_cmp_16(const sp_digit* a, const sp_digit* b); +extern sp_int64 sp_1024_cmp_16(const sp_digit* a, const sp_digit* b); /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. 
* @@ -50254,7 +50193,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; - int64_t n; + sp_int64 n; sp_1024_mont_inv_16(t1, p->z, t + 2*16); @@ -51353,7 +51292,7 @@ static void sp_1024_map_avx2_16(sp_point_1024* r, const sp_point_1024* p, { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; - int64_t n; + sp_int64 n; sp_1024_mont_inv_avx2_16(t1, p->z, t + 2*16); @@ -61745,7 +61684,7 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point, sp_digit t1[16 * 4]; #endif sp_digit* t2 = NULL; - int64_t n; + sp_int64 n; int err = MP_OKAY; #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index b9cd89783..3fc9b3365 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -219,15 +219,15 @@ L_2048_from_bin_movbe_zero_end: */ #ifndef __APPLE__ .text -.globl sp_2048_to_bin_bswap -.type sp_2048_to_bin_bswap,@function +.globl sp_2048_to_bin_bswap_32 +.type sp_2048_to_bin_bswap_32,@function .align 16 -sp_2048_to_bin_bswap: +sp_2048_to_bin_bswap_32: #else .section __TEXT,__text -.globl _sp_2048_to_bin_bswap +.globl _sp_2048_to_bin_bswap_32 .p2align 4 -_sp_2048_to_bin_bswap: +_sp_2048_to_bin_bswap_32: #endif /* __APPLE__ */ movq 248(%rdi), %rdx movq 240(%rdi), %rax @@ -327,7 +327,7 @@ _sp_2048_to_bin_bswap: movq %rax, 248(%rsi) repz retq #ifndef __APPLE__ -.size sp_2048_to_bin_bswap,.-sp_2048_to_bin_bswap +.size sp_2048_to_bin_bswap_32,.-sp_2048_to_bin_bswap_32 #endif /* __APPLE__ */ #ifndef NO_MOVBE_SUPPORT /* Write r as big endian to byte array. @@ -339,15 +339,15 @@ _sp_2048_to_bin_bswap: */ #ifndef __APPLE__ .text -.globl sp_2048_to_bin_movbe -.type sp_2048_to_bin_movbe,@function +.globl sp_2048_to_bin_movbe_32 +.type sp_2048_to_bin_movbe_32,@function .align 16 -sp_2048_to_bin_movbe: +sp_2048_to_bin_movbe_32: #else .section __TEXT,__text -.globl _sp_2048_to_bin_movbe +.globl _sp_2048_to_bin_movbe_32 .p2align 4 -_sp_2048_to_bin_movbe: +_sp_2048_to_bin_movbe_32: #endif /* __APPLE__ */ movbeq 248(%rdi), %rdx movbeq 240(%rdi), %rax @@ -415,7 +415,7 @@ _sp_2048_to_bin_movbe: movq %rax, 248(%rsi) repz retq #ifndef __APPLE__ -.size sp_2048_to_bin_movbe,.-sp_2048_to_bin_movbe +.size sp_2048_to_bin_movbe_32,.-sp_2048_to_bin_movbe_32 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ /* Multiply a and b into r. (r = a * b) @@ -12889,15 +12889,15 @@ L_3072_from_bin_movbe_zero_end: */ #ifndef __APPLE__ .text -.globl sp_3072_to_bin_bswap -.type sp_3072_to_bin_bswap,@function +.globl sp_3072_to_bin_bswap_48 +.type sp_3072_to_bin_bswap_48,@function .align 16 -sp_3072_to_bin_bswap: +sp_3072_to_bin_bswap_48: #else .section __TEXT,__text -.globl _sp_3072_to_bin_bswap +.globl _sp_3072_to_bin_bswap_48 .p2align 4 -_sp_3072_to_bin_bswap: +_sp_3072_to_bin_bswap_48: #endif /* __APPLE__ */ movq 376(%rdi), %rdx movq 368(%rdi), %rax @@ -13045,7 +13045,7 @@ _sp_3072_to_bin_bswap: movq %rax, 376(%rsi) repz retq #ifndef __APPLE__ -.size sp_3072_to_bin_bswap,.-sp_3072_to_bin_bswap +.size sp_3072_to_bin_bswap_48,.-sp_3072_to_bin_bswap_48 #endif /* __APPLE__ */ #ifndef NO_MOVBE_SUPPORT /* Write r as big endian to byte array. 
@@ -13057,15 +13057,15 @@ _sp_3072_to_bin_bswap: */ #ifndef __APPLE__ .text -.globl sp_3072_to_bin_movbe -.type sp_3072_to_bin_movbe,@function +.globl sp_3072_to_bin_movbe_48 +.type sp_3072_to_bin_movbe_48,@function .align 16 -sp_3072_to_bin_movbe: +sp_3072_to_bin_movbe_48: #else .section __TEXT,__text -.globl _sp_3072_to_bin_movbe +.globl _sp_3072_to_bin_movbe_48 .p2align 4 -_sp_3072_to_bin_movbe: +_sp_3072_to_bin_movbe_48: #endif /* __APPLE__ */ movbeq 376(%rdi), %rdx movbeq 368(%rdi), %rax @@ -13165,7 +13165,7 @@ _sp_3072_to_bin_movbe: movq %rax, 376(%rsi) repz retq #ifndef __APPLE__ -.size sp_3072_to_bin_movbe,.-sp_3072_to_bin_movbe +.size sp_3072_to_bin_movbe_48,.-sp_3072_to_bin_movbe_48 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ /* Multiply a and b into r. (r = a * b) @@ -28258,15 +28258,15 @@ L_4096_from_bin_movbe_zero_end: */ #ifndef __APPLE__ .text -.globl sp_4096_to_bin_bswap -.type sp_4096_to_bin_bswap,@function +.globl sp_4096_to_bin_bswap_64 +.type sp_4096_to_bin_bswap_64,@function .align 16 -sp_4096_to_bin_bswap: +sp_4096_to_bin_bswap_64: #else .section __TEXT,__text -.globl _sp_4096_to_bin_bswap +.globl _sp_4096_to_bin_bswap_64 .p2align 4 -_sp_4096_to_bin_bswap: +_sp_4096_to_bin_bswap_64: #endif /* __APPLE__ */ movq 504(%rdi), %rdx movq 496(%rdi), %rax @@ -28462,7 +28462,7 @@ _sp_4096_to_bin_bswap: movq %rax, 504(%rsi) repz retq #ifndef __APPLE__ -.size sp_4096_to_bin_bswap,.-sp_4096_to_bin_bswap +.size sp_4096_to_bin_bswap_64,.-sp_4096_to_bin_bswap_64 #endif /* __APPLE__ */ #ifndef NO_MOVBE_SUPPORT /* Write r as big endian to byte array. @@ -28474,15 +28474,15 @@ _sp_4096_to_bin_bswap: */ #ifndef __APPLE__ .text -.globl sp_4096_to_bin_movbe -.type sp_4096_to_bin_movbe,@function +.globl sp_4096_to_bin_movbe_64 +.type sp_4096_to_bin_movbe_64,@function .align 16 -sp_4096_to_bin_movbe: +sp_4096_to_bin_movbe_64: #else .section __TEXT,__text -.globl _sp_4096_to_bin_movbe +.globl _sp_4096_to_bin_movbe_64 .p2align 4 -_sp_4096_to_bin_movbe: +_sp_4096_to_bin_movbe_64: #endif /* __APPLE__ */ movbeq 504(%rdi), %rdx movbeq 496(%rdi), %rax @@ -28614,7 +28614,7 @@ _sp_4096_to_bin_movbe: movq %rax, 504(%rsi) repz retq #ifndef __APPLE__ -.size sp_4096_to_bin_movbe,.-sp_4096_to_bin_movbe +.size sp_4096_to_bin_movbe_64,.-sp_4096_to_bin_movbe_64 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ /* Sub b from a into a. (a -= b) @@ -41080,15 +41080,15 @@ L_256_from_bin_movbe_zero_end: */ #ifndef __APPLE__ .text -.globl sp_256_to_bin_bswap -.type sp_256_to_bin_bswap,@function +.globl sp_256_to_bin_bswap_4 +.type sp_256_to_bin_bswap_4,@function .align 16 -sp_256_to_bin_bswap: +sp_256_to_bin_bswap_4: #else .section __TEXT,__text -.globl _sp_256_to_bin_bswap +.globl _sp_256_to_bin_bswap_4 .p2align 4 -_sp_256_to_bin_bswap: +_sp_256_to_bin_bswap_4: #endif /* __APPLE__ */ movq 24(%rdi), %rdx movq 16(%rdi), %rax @@ -41104,7 +41104,7 @@ _sp_256_to_bin_bswap: movq %rax, 24(%rsi) repz retq #ifndef __APPLE__ -.size sp_256_to_bin_bswap,.-sp_256_to_bin_bswap +.size sp_256_to_bin_bswap_4,.-sp_256_to_bin_bswap_4 #endif /* __APPLE__ */ #ifndef NO_MOVBE_SUPPORT /* Write r as big endian to byte array. 
@@ -41116,15 +41116,15 @@ _sp_256_to_bin_bswap: */ #ifndef __APPLE__ .text -.globl sp_256_to_bin_movbe -.type sp_256_to_bin_movbe,@function +.globl sp_256_to_bin_movbe_4 +.type sp_256_to_bin_movbe_4,@function .align 16 -sp_256_to_bin_movbe: +sp_256_to_bin_movbe_4: #else .section __TEXT,__text -.globl _sp_256_to_bin_movbe +.globl _sp_256_to_bin_movbe_4 .p2align 4 -_sp_256_to_bin_movbe: +_sp_256_to_bin_movbe_4: #endif /* __APPLE__ */ movbeq 24(%rdi), %rdx movbeq 16(%rdi), %rax @@ -41136,7 +41136,7 @@ _sp_256_to_bin_movbe: movq %rax, 24(%rsi) repz retq #ifndef __APPLE__ -.size sp_256_to_bin_movbe,.-sp_256_to_bin_movbe +.size sp_256_to_bin_movbe_4,.-sp_256_to_bin_movbe_4 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ #ifdef HAVE_INTEL_AVX2 @@ -45179,15 +45179,15 @@ L_384_from_bin_movbe_zero_end: */ #ifndef __APPLE__ .text -.globl sp_384_to_bin_bswap -.type sp_384_to_bin_bswap,@function +.globl sp_384_to_bin_bswap_6 +.type sp_384_to_bin_bswap_6,@function .align 16 -sp_384_to_bin_bswap: +sp_384_to_bin_bswap_6: #else .section __TEXT,__text -.globl _sp_384_to_bin_bswap +.globl _sp_384_to_bin_bswap_6 .p2align 4 -_sp_384_to_bin_bswap: +_sp_384_to_bin_bswap_6: #endif /* __APPLE__ */ movq 40(%rdi), %rdx movq 32(%rdi), %rax @@ -45209,7 +45209,7 @@ _sp_384_to_bin_bswap: movq %rax, 40(%rsi) repz retq #ifndef __APPLE__ -.size sp_384_to_bin_bswap,.-sp_384_to_bin_bswap +.size sp_384_to_bin_bswap_6,.-sp_384_to_bin_bswap_6 #endif /* __APPLE__ */ #ifndef NO_MOVBE_SUPPORT /* Write r as big endian to byte array. @@ -45221,15 +45221,15 @@ _sp_384_to_bin_bswap: */ #ifndef __APPLE__ .text -.globl sp_384_to_bin_movbe -.type sp_384_to_bin_movbe,@function +.globl sp_384_to_bin_movbe_6 +.type sp_384_to_bin_movbe_6,@function .align 16 -sp_384_to_bin_movbe: +sp_384_to_bin_movbe_6: #else .section __TEXT,__text -.globl _sp_384_to_bin_movbe +.globl _sp_384_to_bin_movbe_6 .p2align 4 -_sp_384_to_bin_movbe: +_sp_384_to_bin_movbe_6: #endif /* __APPLE__ */ movbeq 40(%rdi), %rdx movbeq 32(%rdi), %rax @@ -45245,7 +45245,7 @@ _sp_384_to_bin_movbe: movq %rax, 40(%rsi) repz retq #ifndef __APPLE__ -.size sp_384_to_bin_movbe,.-sp_384_to_bin_movbe +.size sp_384_to_bin_movbe_6,.-sp_384_to_bin_movbe_6 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ /* Sub b from a into a. (a -= b) diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index 76e85c902..f56accebd 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -217,7 +217,7 @@ ENDIF ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_2048_to_bin_bswap PROC +sp_2048_to_bin_bswap_32 PROC mov rax, QWORD PTR [rcx+248] mov r8, QWORD PTR [rcx+240] bswap rax @@ -315,7 +315,7 @@ sp_2048_to_bin_bswap PROC mov QWORD PTR [rdx+240], rax mov QWORD PTR [rdx+248], r8 ret -sp_2048_to_bin_bswap ENDP +sp_2048_to_bin_bswap_32 ENDP _text ENDS IFNDEF NO_MOVBE_SUPPORT ; /* Write r as big endian to byte array. @@ -326,7 +326,7 @@ IFNDEF NO_MOVBE_SUPPORT ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_2048_to_bin_movbe PROC +sp_2048_to_bin_movbe_32 PROC movbe rax, QWORD PTR [rcx+248] movbe r8, QWORD PTR [rcx+240] mov QWORD PTR [rdx], rax @@ -392,7 +392,7 @@ sp_2048_to_bin_movbe PROC mov QWORD PTR [rdx+240], rax mov QWORD PTR [rdx+248], r8 ret -sp_2048_to_bin_movbe ENDP +sp_2048_to_bin_movbe_32 ENDP _text ENDS ENDIF ; /* Multiply a and b into r. (r = a * b) @@ -12502,7 +12502,7 @@ ENDIF ; * a Byte array. 
; */ _text SEGMENT READONLY PARA -sp_3072_to_bin_bswap PROC +sp_3072_to_bin_bswap_48 PROC mov rax, QWORD PTR [rcx+376] mov r8, QWORD PTR [rcx+368] bswap rax @@ -12648,7 +12648,7 @@ sp_3072_to_bin_bswap PROC mov QWORD PTR [rdx+368], rax mov QWORD PTR [rdx+376], r8 ret -sp_3072_to_bin_bswap ENDP +sp_3072_to_bin_bswap_48 ENDP _text ENDS IFNDEF NO_MOVBE_SUPPORT ; /* Write r as big endian to byte array. @@ -12659,7 +12659,7 @@ IFNDEF NO_MOVBE_SUPPORT ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_3072_to_bin_movbe PROC +sp_3072_to_bin_movbe_48 PROC movbe rax, QWORD PTR [rcx+376] movbe r8, QWORD PTR [rcx+368] mov QWORD PTR [rdx], rax @@ -12757,7 +12757,7 @@ sp_3072_to_bin_movbe PROC mov QWORD PTR [rdx+368], rax mov QWORD PTR [rdx+376], r8 ret -sp_3072_to_bin_movbe ENDP +sp_3072_to_bin_movbe_48 ENDP _text ENDS ENDIF ; /* Multiply a and b into r. (r = a * b) @@ -27384,7 +27384,7 @@ ENDIF ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_4096_to_bin_bswap PROC +sp_4096_to_bin_bswap_64 PROC mov rax, QWORD PTR [rcx+504] mov r8, QWORD PTR [rcx+496] bswap rax @@ -27578,7 +27578,7 @@ sp_4096_to_bin_bswap PROC mov QWORD PTR [rdx+496], rax mov QWORD PTR [rdx+504], r8 ret -sp_4096_to_bin_bswap ENDP +sp_4096_to_bin_bswap_64 ENDP _text ENDS IFNDEF NO_MOVBE_SUPPORT ; /* Write r as big endian to byte array. @@ -27589,7 +27589,7 @@ IFNDEF NO_MOVBE_SUPPORT ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_4096_to_bin_movbe PROC +sp_4096_to_bin_movbe_64 PROC movbe rax, QWORD PTR [rcx+504] movbe r8, QWORD PTR [rcx+496] mov QWORD PTR [rdx], rax @@ -27719,7 +27719,7 @@ sp_4096_to_bin_movbe PROC mov QWORD PTR [rdx+496], rax mov QWORD PTR [rdx+504], r8 ret -sp_4096_to_bin_movbe ENDP +sp_4096_to_bin_movbe_64 ENDP _text ENDS ENDIF ; /* Sub b from a into a. (a -= b) @@ -39716,7 +39716,7 @@ ENDIF ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_256_to_bin_bswap PROC +sp_256_to_bin_bswap_4 PROC mov rax, QWORD PTR [rcx+24] mov r8, QWORD PTR [rcx+16] bswap rax @@ -39730,7 +39730,7 @@ sp_256_to_bin_bswap PROC mov QWORD PTR [rdx+16], rax mov QWORD PTR [rdx+24], r8 ret -sp_256_to_bin_bswap ENDP +sp_256_to_bin_bswap_4 ENDP _text ENDS IFNDEF NO_MOVBE_SUPPORT ; /* Write r as big endian to byte array. @@ -39741,7 +39741,7 @@ IFNDEF NO_MOVBE_SUPPORT ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_256_to_bin_movbe PROC +sp_256_to_bin_movbe_4 PROC movbe rax, QWORD PTR [rcx+24] movbe r8, QWORD PTR [rcx+16] mov QWORD PTR [rdx], rax @@ -39751,7 +39751,7 @@ sp_256_to_bin_movbe PROC mov QWORD PTR [rdx+16], rax mov QWORD PTR [rdx+24], r8 ret -sp_256_to_bin_movbe ENDP +sp_256_to_bin_movbe_4 ENDP _text ENDS ENDIF IFDEF HAVE_INTEL_AVX2 @@ -43467,7 +43467,7 @@ ENDIF ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_384_to_bin_bswap PROC +sp_384_to_bin_bswap_6 PROC mov rax, QWORD PTR [rcx+40] mov r8, QWORD PTR [rcx+32] bswap rax @@ -43487,7 +43487,7 @@ sp_384_to_bin_bswap PROC mov QWORD PTR [rdx+32], rax mov QWORD PTR [rdx+40], r8 ret -sp_384_to_bin_bswap ENDP +sp_384_to_bin_bswap_6 ENDP _text ENDS IFNDEF NO_MOVBE_SUPPORT ; /* Write r as big endian to byte array. @@ -43498,7 +43498,7 @@ IFNDEF NO_MOVBE_SUPPORT ; * a Byte array. ; */ _text SEGMENT READONLY PARA -sp_384_to_bin_movbe PROC +sp_384_to_bin_movbe_6 PROC movbe rax, QWORD PTR [rcx+40] movbe r8, QWORD PTR [rcx+32] mov QWORD PTR [rdx], rax @@ -43512,7 +43512,7 @@ sp_384_to_bin_movbe PROC mov QWORD PTR [rdx+32], rax mov QWORD PTR [rdx+40], r8 ret -sp_384_to_bin_movbe ENDP +sp_384_to_bin_movbe_6 ENDP _text ENDS ENDIF ; /* Sub b from a into a. 
(a -= b) diff --git a/wolfssl/wolfcrypt/sp_int.h b/wolfssl/wolfcrypt/sp_int.h index a48d5ad5d..0024f965f 100644 --- a/wolfssl/wolfcrypt/sp_int.h +++ b/wolfssl/wolfcrypt/sp_int.h @@ -314,6 +314,25 @@ extern "C" { /* Mask of word size. */ #define SP_WORD_MASK (SP_WORD_SIZE - 1) +/* For debugging only - format string for different digit sizes. */ +#if SP_WORD_SIZE == 64 + #if SP_ULONG_BITS == 64 + #define SP_PRINT_FMT "%016lx" + #else + #define SP_PRINT_FMT "%016llx" + #endif +#elif SP_WORD_SIZE == 32 + #if SP_UINT_BITS == 32 + #define SP_PRINT_FMT "%08x" + #else + #define SP_PRINT_FMT "%08lx" + #endif +#elif SP_WORD_SIZE == 16 + #define SP_PRINT_FMT "%04x" +#elif SP_WORD_SIZE == 8 + #define SP_PRINT_FMT "%02x" +#endif + #if defined(WOLFSSL_HAVE_SP_ECC) && defined(WOLFSSL_SP_NONBLOCK) /* Non-blocking ECC operation context. */ @@ -417,25 +436,6 @@ typedef struct sp_ecc_ctx { #endif -/* For debugging only - format string for different digit sizes. */ -#if SP_WORD_SIZE == 64 - #if SP_ULONG_BITS == 64 - #define SP_PRINT_FMT "%016lx" - #else - #define SP_PRINT_FMT "%016llx" - #endif -#elif SP_WORD_SIZE == 32 - #if SP_UINT_BITS == 32 - #define SP_PRINT_FMT "%08x" - #else - #define SP_PRINT_FMT "%08lx" - #endif -#elif SP_WORD_SIZE == 16 - #define SP_PRINT_FMT "%04x" -#elif SP_WORD_SIZE == 8 - #define SP_PRINT_FMT "%02x" -#endif - #ifndef NO_FILESYSTEM /* Output is formatted to be used with script that checks calculations. */
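Editor's note on the mod_exp hunks above: they replace the 5-bit sliding window (32-entry table, five squarings per window) with a 4-bit window — a 16-entry table t[0..15], four Montgomery squarings and then one multiply by t[y] per window — which is what shrinks td and removes the t[16]..t[31] setup. For reference, a minimal standalone C sketch of just the exponent-window scan those loops perform; the 128-bit sample exponent, the two-word array, and the printing are illustrative assumptions, not wolfSSL code, and the sketch omits the real code's handling of a top word with fewer than 4 usable bits:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Example 128-bit exponent, least-significant word first, as e[] is stored. */
        uint64_t e[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        int bits = 128;
        int i = (bits - 1) / 64;     /* index of the top word                  */
        uint64_t n = e[i--];         /* bits currently buffered, left-aligned  */
        int c = bits & 63;           /* number of valid bits in the top word   */
        unsigned y;

        if (c == 0)
            c = 64;
        /* Use enough top bits that the remaining bit count is a multiple of 4. */
        c -= (bits % 4 == 0) ? 4 : (bits % 4);
        y = (unsigned)(n >> c);      /* first (top) window value               */
        n <<= 64 - c;
        printf("window: %x\n", y);

        while (i >= 0 || c >= 4) {
            if (c >= 4) {            /* 4 bits still buffered in n             */
                y = (unsigned)((n >> 60) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {       /* refill from the next word              */
                n = e[i--];
                y = (unsigned)(n >> 60);
                n <<= 4;
                c = 60;
            }
            else {                   /* window straddles two words             */
                y = (unsigned)(n >> 60);
                n = e[i--];
                c = 4 - c;
                y |= (unsigned)(n >> (64 - c));
                n <<= c;
                c = 64 - c;
            }
            /* In the patched functions: four Montgomery squarings, then one
             * multiply by the precomputed table entry t[y]. */
            printf("window: %x\n", y);
        }
        return 0;
    }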
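Editor's note on the *_to_bin_* renames above (sp_4096_to_bin_bswap_64, sp_256_to_bin_movbe_4, etc.): per their headers, these routines write a digit array out as a fixed-length big-endian byte array, with the assembly picking bswap or movbe based on CPU support. A portable C sketch of the same serialization, assuming 64-bit digits stored least-significant word first; the 4-word size, helper name, and sample data are illustrative only and do not reproduce the patch's assembly or cpuid dispatch:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Write r (least-significant word first) as big-endian bytes into a. */
    static void to_bin_demo(const uint64_t* r, unsigned char* a, size_t words)
    {
        size_t i, j;
        for (i = 0; i < words; i++) {
            uint64_t w = r[words - 1 - i];          /* most-significant word first */
            for (j = 0; j < 8; j++)
                a[i * 8 + j] = (unsigned char)(w >> (56 - 8 * j));
        }
    }

    int main(void)
    {
        uint64_t r[4] = { 0x8877665544332211ULL, 0, 0, 0x0102030405060708ULL };
        unsigned char out[32];
        size_t i;

        to_bin_demo(r, out, 4);
        for (i = 0; i < sizeof(out); i++)
            printf("%02x", out[i]);                 /* 0102030405060708 ... 8877665544332211 */
        printf("\n");
        return 0;
    }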