From c4dfa41088f06b0a1c6f03c751cc1d9c9a377f36 Mon Sep 17 00:00:00 2001
From: Sean Parkinson <sean@wolfssl.com>
Date: Tue, 13 Mar 2018 14:16:48 +1000
Subject: [PATCH] SP improvements

Tag functions as SP_NOINLINE so LLVM does not inline them into huge builds.
Add sp_mod to support the new DH key generation function.
---
 wolfcrypt/src/sp_arm32.c   |   7 +-
 wolfcrypt/src/sp_arm64.c   |   2 +
 wolfcrypt/src/sp_c32.c     | 255 ++++++++++++++++++++++++++++---------
 wolfcrypt/src/sp_c64.c     | 226 ++++++++++++++++++++++----------
 wolfcrypt/src/sp_int.c     | 113 +++++++++++++++-
 wolfcrypt/src/sp_x86_64.c  |  88 +++++++------
 wolfcrypt/test/test.c      | 109 ++++++++++++++++
 wolfssl/wolfcrypt/sp_int.h |   2 +
 8 files changed, 628 insertions(+), 174 deletions(-)

diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index b2cfd84ef..a7a6bc874 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -7326,6 +7326,7 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
 
     return err;
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_2048 */
@@ -8892,7 +8893,8 @@ static sp_digit sp_3072_add_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit p0[32];
     sp_digit p1[32];
@@ -8940,7 +8942,7 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 {
     sp_digit p0[32];
     sp_digit p1[32];
@@ -16728,6 +16730,7 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
 
     return err;
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_3072 */
diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c
index 2af0a6fb2..c2396eba5 100644
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -5056,6 +5056,7 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
 
     return err;
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_2048 */
@@ -12036,6 +12037,7 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
 
     return err;
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_3072 */
diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c
index 4259db97d..e5e8275a6 100644
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -48,13 +48,13 @@
 
 #include <wolfssl/wolfcrypt/sp.h>
 
+#ifndef WOLFSSL_SP_ASM
+#if SP_WORD_SIZE == 32
 #if defined(WOLFSSL_SP_CACHE_RESISTANT) || defined(WOLFSSL_SP_SMALL)
 /* Mask for address to obfuscate which of the two address will be used. */
 static const size_t addr_mask[2] = { 0, (size_t)-1 };
 #endif
 
-#ifndef WOLFSSL_SP_ASM
-#if SP_WORD_SIZE == 32
 #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
 #ifndef WOLFSSL_SP_NO_2048
 /* Read big endian unsigned byte aray into r.
@@ -197,7 +197,8 @@ static void sp_2048_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_9(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int64_t t0   = ((int64_t)a[ 0]) * b[ 0];
     int64_t t1   = ((int64_t)a[ 0]) * b[ 1]
@@ -306,7 +307,7 @@ static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
 {
     int64_t t0   =  ((int64_t)a[ 0]) * a[ 0];
     int64_t t1   = (((int64_t)a[ 0]) * a[ 1]) * 2;
@@ -380,7 +381,7 @@ static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] + b[ 0];
@@ -402,7 +403,7 @@ static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -429,7 +430,7 @@ static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -456,7 +457,8 @@ static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_18(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -478,7 +480,7 @@ static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -499,7 +501,7 @@ static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -528,7 +530,7 @@ static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -557,7 +559,8 @@ static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_54(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit p0[36];
     sp_digit p1[36];
@@ -605,7 +608,7 @@ static void sp_2048_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_54(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_54(sp_digit* r, const sp_digit* a)
 {
     sp_digit p0[36];
     sp_digit p1[36];
@@ -648,7 +651,7 @@ static void sp_2048_sqr_54(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -679,7 +682,7 @@ static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -708,7 +711,7 @@ static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -737,7 +740,8 @@ static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_108(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_108(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[108];
@@ -759,7 +763,7 @@ static void sp_2048_mul_108(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[108];
@@ -782,7 +786,7 @@ static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -800,7 +804,7 @@ static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -819,7 +823,8 @@ static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_108(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_108(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int64_t c;
@@ -849,7 +854,7 @@ static void sp_2048_mul_108(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int64_t c;
@@ -886,7 +891,7 @@ static void sp_2048_sqr_108(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -904,7 +909,7 @@ static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -922,7 +927,7 @@ static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -955,7 +960,8 @@ static int sp_2048_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_54(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int64_t c;
@@ -985,7 +991,7 @@ static void sp_2048_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_54(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_54(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int64_t c;
@@ -1162,7 +1168,7 @@ static void sp_2048_cond_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -1353,7 +1359,8 @@ static void sp_2048_mont_sqr_54(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_d_54(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_2048_mul_d_54(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int64_t tb = b;
@@ -1442,6 +1449,24 @@ static void sp_2048_cond_add_54(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r digit by digit; no carry is propagated. (r = a + b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 54 digit sums r[i] = a[i] + b[i].
+ * a  First operand; digits 0..53 are read.
+ * b  Second operand; digits 0..53 are read.
+ */
+SP_NOINLINE static int sp_2048_add_54(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 54; i++)
+        r[i] = a[i] + b[i]; /* Independent per-digit sum. */
+
+    return 0; /* The return value is always 0. */
+}
+#endif /* WOLFSSL_SMALL */
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -1960,7 +1985,7 @@ static void sp_2048_cond_sub_108(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_add_108(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_add_108(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -2156,7 +2181,8 @@ static void sp_2048_mont_sqr_108(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_d_108(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_2048_mul_d_108(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int64_t tb = b;
@@ -2239,6 +2265,43 @@ static void sp_2048_cond_add_108(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Sub b from a into r digit by digit; no borrow is propagated. (r = a - b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 108 digit differences r[i] = a[i] - b[i].
+ * a  Minuend; digits 0..107 are read.
+ * b  Subtrahend; digits 0..107 are read.
+ */
+SP_NOINLINE static int sp_2048_sub_108(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 108; i++)
+        r[i] = a[i] - b[i]; /* Independent per-digit difference. */
+
+    return 0; /* The return value is always 0. */
+}
+
+#endif /* WOLFSSL_SMALL */
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r digit by digit; no carry is propagated. (r = a + b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 108 digit sums r[i] = a[i] + b[i].
+ * a  First operand; digits 0..107 are read.
+ * b  Second operand; digits 0..107 are read.
+ */
+SP_NOINLINE static int sp_2048_add_108(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 108; i++)
+        r[i] = a[i] + b[i]; /* Independent per-digit sum. */
+
+    return 0; /* The return value is always 0. */
+}
+#endif /* WOLFSSL_SMALL */
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -3412,6 +3475,7 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
     return err;
 #endif
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_2048 */
@@ -3557,7 +3621,8 @@ static void sp_3072_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_70(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j;
     int64_t t[140];
@@ -3579,7 +3644,7 @@ static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
 {
     int i, j;
     int64_t t[140];
@@ -3603,7 +3668,7 @@ static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3634,7 +3699,7 @@ static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3663,7 +3728,7 @@ static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3692,7 +3757,8 @@ static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_140(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[140];
@@ -3714,7 +3780,7 @@ static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[140];
@@ -3737,7 +3803,7 @@ static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3755,7 +3821,7 @@ static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3774,7 +3840,8 @@ static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_140(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int64_t c;
@@ -3804,7 +3871,7 @@ static void sp_3072_mul_140(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int64_t c;
@@ -3841,7 +3908,7 @@ static void sp_3072_sqr_140(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3859,7 +3926,7 @@ static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3877,7 +3944,7 @@ static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3910,7 +3977,8 @@ static int sp_3072_sub_70(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_70(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int64_t c;
@@ -3940,7 +4008,7 @@ static void sp_3072_mul_70(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_70(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int64_t c;
@@ -4117,7 +4185,7 @@ static void sp_3072_cond_sub_70(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_add_70(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_add_70(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -4323,7 +4391,8 @@ static void sp_3072_mont_sqr_70(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_d_70(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_3072_mul_d_70(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int64_t tb = b;
@@ -4412,6 +4481,24 @@ static void sp_3072_cond_add_70(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r digit by digit; no carry is propagated. (r = a + b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 70 digit sums r[i] = a[i] + b[i].
+ * a  First operand; digits 0..69 are read.
+ * b  Second operand; digits 0..69 are read.
+ */
+SP_NOINLINE static int sp_3072_add_70(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 70; i++)
+        r[i] = a[i] + b[i]; /* Independent per-digit sum. */
+
+    return 0; /* The return value is always 0. */
+}
+#endif /* WOLFSSL_SMALL */
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -4930,7 +5017,7 @@ static void sp_3072_cond_sub_140(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_add_140(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_add_140(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -5139,7 +5226,8 @@ static void sp_3072_mont_sqr_140(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_d_140(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_3072_mul_d_140(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int64_t tb = b;
@@ -5222,6 +5310,43 @@ static void sp_3072_cond_add_140(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Sub b from a into r digit by digit; no borrow is propagated. (r = a - b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 140 digit differences r[i] = a[i] - b[i].
+ * a  Minuend; digits 0..139 are read.
+ * b  Subtrahend; digits 0..139 are read.
+ */
+SP_NOINLINE static int sp_3072_sub_140(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 140; i++)
+        r[i] = a[i] - b[i]; /* Independent per-digit difference. */
+
+    return 0; /* The return value is always 0. */
+}
+
+#endif /* WOLFSSL_SMALL */
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r digit by digit; no carry is propagated. (r = a + b)
+ * NOTE(review): guard is WOLFSSL_SMALL, not WOLFSSL_SP_SMALL - confirm.
+ * r  Output: receives the 140 digit sums r[i] = a[i] + b[i].
+ * a  First operand; digits 0..139 are read.
+ * b  Second operand; digits 0..139 are read.
+ */
+SP_NOINLINE static int sp_3072_add_140(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 140; i++)
+        r[i] = a[i] + b[i]; /* Independent per-digit sum. */
+
+    return 0; /* The return value is always 0. */
+}
+#endif /* WOLFSSL_SMALL */
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -6395,6 +6520,7 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
     return err;
 #endif
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_3072 */
@@ -6895,7 +7021,7 @@ static void sp_256_cond_sub_10(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -7031,7 +7157,8 @@ static void sp_256_mont_reduce_10(sp_digit* a, sp_digit* m, sp_digit mp)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_256_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_256_mul_10(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int64_t c;
@@ -7063,7 +7190,8 @@ static void sp_256_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_256_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_256_mul_10(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int64_t t0   = ((int64_t)a[ 0]) * b[ 0];
     int64_t t1   = ((int64_t)a[ 0]) * b[ 1]
@@ -7211,7 +7339,7 @@ static void sp_256_mont_mul_10(sp_digit* r, sp_digit* a, sp_digit* b,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int64_t c;
@@ -7245,7 +7373,7 @@ static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
 {
     int64_t t0   =  ((int64_t)a[ 0]) * a[ 0];
     int64_t t1   = (((int64_t)a[ 0]) * a[ 1]) * 2;
@@ -7486,7 +7614,7 @@ static void sp_256_map_10(sp_point* r, sp_point* p, sp_digit* t)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_add_10(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_add_10(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -7503,7 +7631,7 @@ static int sp_256_add_10(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_add_10(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_add_10(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] + b[ 0];
@@ -7576,7 +7704,7 @@ static void sp_256_mont_tpl_10(sp_digit* r, sp_digit* a, sp_digit* m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_sub_10(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_sub_10(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -7594,7 +7722,7 @@ static int sp_256_sub_10(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_sub_10(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_sub_10(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] - b[ 0];
@@ -7663,7 +7791,7 @@ static void sp_256_mont_sub_10(sp_digit* r, sp_digit* a, sp_digit* b,
  * r  Result of shift.
  * a  Number to shift.
  */
-static void sp_256_rshift1_10(sp_digit* r, sp_digit* a)
+SP_NOINLINE static void sp_256_rshift1_10(sp_digit* r, sp_digit* a)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
@@ -10374,7 +10502,7 @@ static int sp_256_iszero_10(const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_add_one_10(sp_digit* a)
+SP_NOINLINE static void sp_256_add_one_10(sp_digit* a)
 {
     a[0]++;
     sp_256_norm_10(a);
@@ -10639,7 +10767,8 @@ int sp_ecc_secret_gen_256(mp_int* priv, ecc_point* pub, byte* out,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_256_mul_d_10(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_256_mul_d_10(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int64_t tb = b;
diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c
index 038aaac7b..57936a6fc 100644
--- a/wolfcrypt/src/sp_c64.c
+++ b/wolfcrypt/src/sp_c64.c
@@ -48,13 +48,13 @@
 
 #include <wolfssl/wolfcrypt/sp.h>
 
+#ifndef WOLFSSL_SP_ASM
+#if SP_WORD_SIZE == 64
 #if defined(WOLFSSL_SP_CACHE_RESISTANT) || defined(WOLFSSL_SP_SMALL)
 /* Mask for address to obfuscate which of the two address will be used. */
 static const size_t addr_mask[2] = { 0, (size_t)-1 };
 #endif
 
-#ifndef WOLFSSL_SP_ASM
-#if SP_WORD_SIZE == 64
 #if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
 #ifndef WOLFSSL_SP_NO_2048
 /* Read big endian unsigned byte aray into r.
@@ -197,7 +197,8 @@ static void sp_2048_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_9(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int128_t t0   = ((int128_t)a[ 0]) * b[ 0];
     int128_t t1   = ((int128_t)a[ 0]) * b[ 1]
@@ -306,7 +307,7 @@ static void sp_2048_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
 {
     int128_t t0   =  ((int128_t)a[ 0]) * a[ 0];
     int128_t t1   = (((int128_t)a[ 0]) * a[ 1]) * 2;
@@ -380,7 +381,7 @@ static void sp_2048_sqr_9(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] + b[ 0];
@@ -402,7 +403,7 @@ static int sp_2048_add_9(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -429,7 +430,7 @@ static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -456,7 +457,8 @@ static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_18(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -478,7 +480,7 @@ static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -499,7 +501,7 @@ static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -528,7 +530,7 @@ static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -557,7 +559,8 @@ static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[36];
@@ -579,7 +582,7 @@ static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[36];
@@ -602,7 +605,7 @@ static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -620,7 +623,7 @@ static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -639,7 +642,8 @@ static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int128_t c;
@@ -669,7 +673,7 @@ static void sp_2048_mul_36(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int128_t c;
@@ -706,7 +710,7 @@ static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -724,7 +728,7 @@ static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -743,7 +747,8 @@ static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_18(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int128_t c;
@@ -773,7 +778,7 @@ static void sp_2048_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_18(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int128_t c;
@@ -939,7 +944,7 @@ static void sp_2048_cond_sub_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_add_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_add_18(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -1114,7 +1119,8 @@ static void sp_2048_mont_sqr_18(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int128_t tb = b;
@@ -1191,6 +1197,43 @@ static void sp_2048_cond_add_18(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Subtract b from a into r one digit at a time. (r = a - b)
+ * No borrow moves between digits; the return value is always 0.
+ * r  Result of the subtraction (18 digits written).
+ * a  Number to subtract from (18 digits read).
+ * b  Number to subtract (18 digits read).
+ */
+SP_NOINLINE static int sp_2048_sub_18(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 18; i++)
+        r[i] = a[i] - b[i];
+
+    return 0;
+}
+
+#endif
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r one digit at a time. (r = a + b)
+ * No carry moves between digits; the return value is always 0.
+ * r  Result of the addition (18 digits written).
+ * a  First number to add (18 digits read).
+ * b  Second number to add (18 digits read).
+ */
+SP_NOINLINE static int sp_2048_add_18(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 18; i++)
+        r[i] = a[i] + b[i];
+
+    return 0;
+}
+#endif
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -1709,7 +1752,7 @@ static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -1918,7 +1961,8 @@ static void sp_2048_mont_sqr_36(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int128_t tb = b;
@@ -2001,6 +2045,43 @@ static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
+#ifdef WOLFSSL_SMALL
+/* Subtract b from a into r one digit at a time. (r = a - b)
+ * No borrow moves between digits; the return value is always 0.
+ * r  Result of the subtraction (36 digits written).
+ * a  Number to subtract from (36 digits read).
+ * b  Number to subtract (36 digits read).
+ */
+SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 36; i++)
+        r[i] = a[i] - b[i];
+
+    return 0;
+}
+
+#endif
+#ifdef WOLFSSL_SMALL
+/* Add b to a into r one digit at a time. (r = a + b)
+ * No carry moves between digits; the return value is always 0.
+ * r  Result of the addition (36 digits written).
+ * a  First number to add (36 digits read).
+ * b  Second number to add (36 digits read).
+ */
+SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
+        const sp_digit* b)
+{
+    int i;
+
+    for (i = 0; i < 36; i++)
+        r[i] = a[i] + b[i];
+
+    return 0;
+}
+#endif
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -3170,6 +3251,7 @@ int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
     return err;
 #endif
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_2048 */
@@ -3315,7 +3397,8 @@ static void sp_3072_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_9(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int128_t t0   = ((int128_t)a[ 0]) * b[ 0];
     int128_t t1   = ((int128_t)a[ 0]) * b[ 1]
@@ -3424,7 +3507,7 @@ static void sp_3072_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a)
 {
     int128_t t0   =  ((int128_t)a[ 0]) * a[ 0];
     int128_t t1   = (((int128_t)a[ 0]) * a[ 1]) * 2;
@@ -3498,7 +3581,7 @@ static void sp_3072_sqr_9(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_9(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_9(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] + b[ 0];
@@ -3520,7 +3603,7 @@ static int sp_3072_add_9(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3547,7 +3630,7 @@ static int sp_3072_add_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_18(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_18(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3574,7 +3657,8 @@ static int sp_3072_sub_18(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_18(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -3596,7 +3680,7 @@ static void sp_3072_mul_18(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_18(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_18(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z1[18];
@@ -3617,7 +3701,7 @@ static void sp_3072_sqr_18(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3646,7 +3730,7 @@ static int sp_3072_sub_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_36(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_36(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3675,7 +3759,8 @@ static int sp_3072_add_36(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit p0[36];
     sp_digit p1[36];
@@ -3723,7 +3808,7 @@ static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
 {
     sp_digit p0[36];
     sp_digit p1[36];
@@ -3768,7 +3853,7 @@ static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3785,7 +3870,7 @@ static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3818,7 +3903,7 @@ static int sp_3072_add_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3836,7 +3921,7 @@ static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3869,7 +3954,8 @@ static int sp_3072_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_54(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int128_t c;
@@ -3899,7 +3985,7 @@ static void sp_3072_mul_54(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int128_t c;
@@ -3936,7 +4022,7 @@ static void sp_3072_sqr_54(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3953,7 +4039,7 @@ static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -3983,7 +4069,7 @@ static int sp_3072_add_27(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -4001,7 +4087,7 @@ static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -4031,7 +4117,8 @@ static int sp_3072_sub_27(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int128_t c;
@@ -4063,7 +4150,8 @@ static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_27(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j;
     int128_t t[54];
@@ -4087,7 +4175,7 @@ static void sp_3072_mul_27(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int128_t c;
@@ -4121,7 +4209,7 @@ static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_27(sp_digit* r, const sp_digit* a)
 {
     int i, j;
     int128_t t[54];
@@ -4280,7 +4368,7 @@ static void sp_3072_cond_sub_27(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_add_27(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_add_27(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -4471,7 +4559,8 @@ static void sp_3072_mont_sqr_27(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int128_t tb = b;
@@ -5075,7 +5164,7 @@ static void sp_3072_cond_sub_54(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_add_54(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_add_54(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -5285,7 +5374,8 @@ static void sp_3072_mont_sqr_54(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int128_t tb = b;
@@ -6544,6 +6634,7 @@ int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
     return err;
 #endif
 }
+
 #endif /* WOLFSSL_HAVE_SP_DH */
 
 #endif /* WOLFSSL_SP_NO_3072 */
@@ -7012,7 +7103,7 @@ static void sp_256_cond_sub_5(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_256_mul_add_5(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_256_mul_add_5(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
@@ -7122,7 +7213,8 @@ static void sp_256_mont_reduce_5(sp_digit* a, sp_digit* m, sp_digit mp)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_256_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int i, j, k;
     int128_t c;
@@ -7154,7 +7246,8 @@ static void sp_256_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_256_mul_5(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_256_mul_5(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     int128_t t0   = ((int128_t)a[ 0]) * b[ 0];
     int128_t t1   = ((int128_t)a[ 0]) * b[ 1]
@@ -7217,7 +7310,7 @@ static void sp_256_mont_mul_5(sp_digit* r, sp_digit* a, sp_digit* b,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_sqr_5(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_256_sqr_5(sp_digit* r, const sp_digit* a)
 {
     int i, j, k;
     int128_t c;
@@ -7251,7 +7344,7 @@ static void sp_256_sqr_5(sp_digit* r, const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_sqr_5(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_256_sqr_5(sp_digit* r, const sp_digit* a)
 {
     int128_t t0   =  ((int128_t)a[ 0]) * a[ 0];
     int128_t t1   = (((int128_t)a[ 0]) * a[ 1]) * 2;
@@ -7442,7 +7535,7 @@ static void sp_256_map_5(sp_point* r, sp_point* p, sp_digit* t)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_add_5(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_add_5(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -7459,7 +7552,7 @@ static int sp_256_add_5(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_add_5(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_add_5(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] + b[ 0];
@@ -7527,7 +7620,7 @@ static void sp_256_mont_tpl_5(sp_digit* r, sp_digit* a, sp_digit* m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_sub_5(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_sub_5(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     int i;
@@ -7545,7 +7638,7 @@ static int sp_256_sub_5(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static int sp_256_sub_5(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static int sp_256_sub_5(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     r[ 0] = a[ 0] - b[ 0];
@@ -7604,7 +7697,7 @@ static void sp_256_mont_sub_5(sp_digit* r, sp_digit* a, sp_digit* b,
  * r  Result of shift.
  * a  Number to shift.
  */
-static void sp_256_rshift1_5(sp_digit* r, sp_digit* a)
+SP_NOINLINE static void sp_256_rshift1_5(sp_digit* r, sp_digit* a)
 {
 #ifdef WOLFSSL_SP_SMALL
     int i;
@@ -10308,7 +10401,7 @@ static int sp_256_iszero_5(const sp_digit* a)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_add_one_5(sp_digit* a)
+SP_NOINLINE static void sp_256_add_one_5(sp_digit* a)
 {
     a[0]++;
     sp_256_norm_5(a);
@@ -10573,7 +10666,8 @@ int sp_ecc_secret_gen_256(mp_int* priv, ecc_point* pub, byte* out,
  * a  A single precision integer.
  * b  A scalar.
  */
-static void sp_256_mul_d_5(sp_digit* r, const sp_digit* a, const sp_digit b)
+SP_NOINLINE static void sp_256_mul_d_5(sp_digit* r, const sp_digit* a,
+    const sp_digit b)
 {
 #ifdef WOLFSSL_SP_SMALL
     int128_t tb = b;
diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c
index a732f54d0..74a37b6be 100644
--- a/wolfcrypt/src/sp_int.c
+++ b/wolfcrypt/src/sp_int.c
@@ -436,6 +436,103 @@ int sp_cmp_d(sp_int *a, sp_int_digit d)
     return MP_EQ;
 }
 
+/* Left shift the number in place by a number of bits.
+ * Whole words are moved with sp_lshd() first, then the remaining bits.
+ *
+ * a  SP integer to shift; dp[used] is written, so one spare digit is needed.
+ * n  Number of bits to shift; may exceed the word size.
+ * returns MP_OKAY always.
+ */
+static int sp_lshb(sp_int* a, int n)
+{
+    int i;
+
+    if (n >= SP_WORD_SIZE) {
+        sp_lshd(a, n / SP_WORD_SIZE);
+        n %= SP_WORD_SIZE;
+    }
+
+    if (n == 0)
+        return MP_OKAY;
+
+    a->dp[a->used] = 0;
+    for (i = a->used - 1; i >= 0; i--) {
+        a->dp[i+1] |= a->dp[i] >> (SP_WORD_SIZE - n);
+        a->dp[i] = a->dp[i] << n;
+    }
+    if (a->dp[a->used] != 0)
+        a->used++;
+
+    return MP_OKAY;
+}
+
+/* Subtract two large numbers into result: r = a - b
+ * a must be greater than or equal to b. r may be the same object as a.
+ *
+ * a  SP integer to subtract from.
+ * b  SP integer to subtract.
+ * r  SP integer that receives the difference.
+ * returns MP_OKAY always.
+ */
+static int sp_sub(sp_int* a, sp_int* b, sp_int* r)
+{
+    int i;
+    sp_int_digit c = 0;
+    sp_int_digit t;
+
+    for (i = 0; i < a->used && i < b->used; i++) {
+        t = a->dp[i] - b->dp[i] - c;
+        if (c == 0)
+            c = t > a->dp[i];
+        else
+            c = t >= a->dp[i];
+        r->dp[i] = t;
+    }
+    for (; i < a->used; i++) {
+        r->dp[i] = a->dp[i] - c;
+        c &= (r->dp[i] == (sp_int_digit)-1); /* borrow persists only on wrap */
+    }
+    r->used = i;
+    sp_clamp(r);
+
+    return MP_OKAY;
+}
+
+/* Calculate r = a mod m by subtracting left-shifted copies of m from r.
+ *
+ * a  SP integer to reduce.
+ * m  SP integer modulus; it is not checked for zero here.
+ * r  SP integer result; may be the same object as a.
+ * returns MP_OKAY always.
+ */
+int sp_mod(sp_int* a, sp_int* m, sp_int* r)
+{
+    sp_int t;
+    int mBits = sp_count_bits(m);
+    int rBits;
+
+    if (a != r)
+        sp_copy(a, r);
+    sp_init(&t);
+
+    rBits = sp_count_bits(r);
+    while (rBits > mBits) {
+        sp_copy(m, &t);
+        sp_lshb(&t, rBits - mBits);
+
+        if (sp_cmp(&t, r) == MP_GT) { /* too large: shift one bit less */
+            sp_copy(m, &t);
+            sp_lshb(&t, rBits - mBits - 1);
+        }
+        sp_sub(r, &t, r);
+
+        rBits = sp_count_bits(r);
+    }
+    if (sp_cmp(r, m) != MP_LT) /* r is not yet below m: subtract once more */
+        sp_sub(r, m, r);
+
+    return MP_OKAY;
+}
 
 #if defined(USE_FAST_MATH) || !defined(NO_BIG_INT)
 /* Clear all data in the big number and sets value to zero.
@@ -493,23 +590,33 @@ int sp_lshd(sp_int* a, int s)
 
     XMEMMOVE(a->dp + s, a->dp, a->used * SP_INT_DIGITS);
     a->used += s;
+    XMEMSET(a->dp, 0, s * sizeof(sp_int_digit));
 
     return MP_OKAY;
 }
 #endif
 
 #ifndef NO_PWDBASED
+/* Add two large numbers into result: r = a + b
+ *
+ * a  SP integer.
+ * b  SP integer.
+ * r  SP integer.
+ * returns MP_OKAY always.
+ */
 int sp_add(sp_int* a, sp_int* b, sp_int* r)
 {
     int i;
     sp_digit c = 0;
+    sp_digit t;
 
     for (i = 0; i < a->used && i < b->used; i++) {
-        r->dp[i] = a->dp[i] + b->dp[i] + c;
+        t = a->dp[i] + b->dp[i] + c;
         if (c == 0)
-            c = r->dp[i] < a->dp[i];
+            c = t < a->dp[i];
         else
-            c = r->dp[i] <= a->dp[i];
+            c = t <= a->dp[i];
+        r->dp[i] = t;
     }
     for (; i < a->used; i++) {
         r->dp[i] = a->dp[i] + c;
diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c
index d1fcdbd71..79f26f0c9 100644
--- a/wolfcrypt/src/sp_x86_64.c
+++ b/wolfcrypt/src/sp_x86_64.c
@@ -186,7 +186,8 @@ static void sp_2048_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit tmp[16];
 
@@ -1799,7 +1800,7 @@ static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
 {
     sp_digit tmp[16];
 
@@ -2856,7 +2857,7 @@ static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
  * a   First number to multiply.
  * b   Second number to multiply.
  */
-static void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit tmp[2*16];
@@ -4504,7 +4505,7 @@ static void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a)
 {
     sp_digit tmp[32];
 
@@ -5508,7 +5509,7 @@ static void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -5576,7 +5577,8 @@ static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer and result.
  * b  A single precision integer.
  */
-static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
+SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit c = 0;
 
@@ -5724,7 +5726,7 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -5870,7 +5872,7 @@ static void sp_2048_mask_16(sp_digit* r, sp_digit* a, sp_digit m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit* z0 = r;
@@ -5902,7 +5904,7 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z2[32];
@@ -5931,7 +5933,7 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit* z0 = r;
@@ -5965,7 +5967,7 @@ static void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z2[32];
@@ -6017,7 +6019,8 @@ static void sp_2048_mont_setup(sp_digit* a, sp_digit* rho)
  * a  A single precision integer and result.
  * b  A single precision integer.
  */
-static sp_digit sp_2048_sub_in_place_16(sp_digit* a, const sp_digit* b)
+SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit c = 0;
 
@@ -6473,7 +6476,7 @@ static void sp_2048_mont_sqr_16(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -6616,7 +6619,7 @@ static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_2048_mul_d_avx2_16(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_d_avx2_16(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -8074,7 +8077,7 @@ static void sp_2048_mont_sqr_32(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -8345,7 +8348,7 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_2048_mul_d_avx2_32(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_2048_mul_d_avx2_32(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -10167,7 +10170,8 @@ static void sp_3072_to_bin(sp_digit* r, byte* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit tmp[24];
 
@@ -13732,7 +13736,7 @@ static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
 {
     sp_digit tmp[24];
 
@@ -15901,7 +15905,7 @@ static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
  * a   First number to multiply.
  * b   Second number to multiply.
  */
-static void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit tmp[2*24];
@@ -19517,7 +19521,7 @@ static void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a)
 {
     sp_digit tmp[48];
 
@@ -21591,7 +21595,7 @@ static void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -21683,7 +21687,8 @@ static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
  * a  A single precision integer and result.
  * b  A single precision integer.
  */
-static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
+SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit c = 0;
 
@@ -21895,7 +21900,7 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -22089,7 +22094,7 @@ static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit* z0 = r;
@@ -22121,7 +22126,7 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z2[48];
@@ -22150,7 +22155,7 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit* z0 = r;
@@ -22184,7 +22189,7 @@ static void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a,
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_3072_sqr_avx2_48(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_3072_sqr_avx2_48(sp_digit* r, const sp_digit* a)
 {
     sp_digit* z0 = r;
     sp_digit z2[48];
@@ -22236,7 +22241,8 @@ static void sp_3072_mont_setup(sp_digit* a, sp_digit* rho)
  * a  A single precision integer and result.
  * b  A single precision integer.
  */
-static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b)
+SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit c = 0;
 
@@ -22860,7 +22866,7 @@ static void sp_3072_mont_sqr_24(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -23067,7 +23073,7 @@ static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_3072_mul_d_avx2_24(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_d_avx2_24(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -24951,7 +24957,7 @@ static void sp_3072_mont_sqr_48(sp_digit* r, sp_digit* a, sp_digit* m,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -25350,7 +25356,7 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_3072_mul_d_avx2_48(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_3072_mul_d_avx2_48(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -27845,7 +27851,7 @@ static sp_digit sp_256_cond_sub_4(sp_digit* r, sp_digit* a, sp_digit* b,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_256_sub_4(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_256_sub_4(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -28637,7 +28643,7 @@ static void sp_256_mont_sub_4(sp_digit* r, sp_digit* a, sp_digit* b,
  * a  Number to divide.
  * m  Modulus (prime).
  */
-static void sp_256_div2_4(sp_digit* r, sp_digit* a, sp_digit* m)
+SP_NOINLINE static void sp_256_div2_4(sp_digit* r, sp_digit* a, sp_digit* m)
 {
     __asm__ __volatile__ (
         "movq	0(%[a]), %%rax\n\t"
@@ -44674,7 +44680,7 @@ int sp_ecc_secret_gen_256(mp_int* priv, ecc_point* pub, byte* out,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static sp_digit sp_256_add_4(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static sp_digit sp_256_add_4(sp_digit* r, const sp_digit* a,
         const sp_digit* b)
 {
     sp_digit c = 0;
@@ -44709,7 +44715,8 @@ static sp_digit sp_256_add_4(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision integer.
  */
-static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b)
+SP_NOINLINE static void sp_256_mul_4(sp_digit* r, const sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit tmp[4];
 
@@ -44940,7 +44947,8 @@ SP_NOINLINE static void sp_256_mul_avx2_4(sp_digit* r, const sp_digit* a,
  * a  A single precision integer and result.
  * b  A single precision integer.
  */
-static sp_digit sp_256_sub_in_place_4(sp_digit* a, const sp_digit* b)
+SP_NOINLINE static sp_digit sp_256_sub_in_place_4(sp_digit* a,
+    const sp_digit* b)
 {
     sp_digit c = 0;
 
@@ -44976,7 +44984,7 @@ static sp_digit sp_256_sub_in_place_4(sp_digit* a, const sp_digit* b)
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_256_mul_d_4(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_256_mul_d_4(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -45023,7 +45031,7 @@ static void sp_256_mul_d_4(sp_digit* r, const sp_digit* a,
  * a  A single precision integer.
  * b  A single precision digit.
  */
-static void sp_256_mul_d_avx2_4(sp_digit* r, const sp_digit* a,
+SP_NOINLINE static void sp_256_mul_d_avx2_4(sp_digit* r, const sp_digit* a,
         const sp_digit b)
 {
     __asm__ __volatile__ (
@@ -45169,7 +45177,7 @@ static INLINE int sp_256_mod_4(sp_digit* r, sp_digit* a, sp_digit* m)
  * r  A single precision integer.
  * a  A single precision integer.
  */
-static void sp_256_sqr_4(sp_digit* r, const sp_digit* a)
+SP_NOINLINE static void sp_256_sqr_4(sp_digit* r, const sp_digit* a)
 {
     sp_digit tmp[4];
 
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index b27e0b51d..543a400f0 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -10282,6 +10282,125 @@ exit_rsa:
 
 #ifndef NO_DH
 
+/* Generate a DH key pair from fixed domain parameters that include q.
+ *
+ * Loads a 256-byte p, a 256-byte g and a 28-byte q with wc_DhSetKey_ex(),
+ * then calls wc_DhGenerateKeyPair(). Only the return codes are checked; the
+ * generated key values are not inspected.
+ *
+ * rng  Random number generator handed to wc_DhGenerateKeyPair().
+ * returns 0 on success, -5725 when key initialization fails, -5726 (set
+ *         through ERROR_OUT) when loading the parameters fails and -5727
+ *         when key generation (or the async wait) fails.
+ */
+static int dh_fips_generate_test(WC_RNG *rng)
+{
+    int    ret = 0;
+    DhKey  key;
+    static const byte p[] = {
+        0xc5, 0x7c, 0xa2, 0x4f, 0x4b, 0xd6, 0x8c, 0x3c,
+        0xda, 0xc7, 0xba, 0xaa, 0xea, 0x2e, 0x5c, 0x1e,
+        0x18, 0xb2, 0x7b, 0x8c, 0x55, 0x65, 0x9f, 0xea,
+        0xe0, 0xa1, 0x36, 0x53, 0x2b, 0x36, 0xe0, 0x4e,
+        0x3e, 0x64, 0xa9, 0xe4, 0xfc, 0x8f, 0x32, 0x62,
+        0x97, 0xe4, 0xbe, 0xf7, 0xc1, 0xde, 0x07, 0x5a,
+        0x89, 0x28, 0xf3, 0xfe, 0x4f, 0xfe, 0x68, 0xbc,
+        0xfb, 0x0a, 0x7c, 0xa4, 0xb3, 0x14, 0x48, 0x89,
+        0x9f, 0xaf, 0xb8, 0x43, 0xe2, 0xa0, 0x62, 0x5c,
+        0xb4, 0x88, 0x3f, 0x06, 0x50, 0x11, 0xfe, 0x65,
+        0x8d, 0x49, 0xd2, 0xf5, 0x4b, 0x74, 0x79, 0xdb,
+        0x06, 0x62, 0x92, 0x89, 0xed, 0xda, 0xcb, 0x87,
+        0x37, 0x16, 0xd2, 0xa1, 0x7a, 0xe8, 0xde, 0x92,
+        0xee, 0x3e, 0x41, 0x4a, 0x91, 0x5e, 0xed, 0xf3,
+        0x6c, 0x6b, 0x7e, 0xfd, 0x15, 0x92, 0x18, 0xfc,
+        0xa7, 0xac, 0x42, 0x85, 0x57, 0xe9, 0xdc, 0xda,
+        0x55, 0xc9, 0x8b, 0x28, 0x9e, 0xc1, 0xc4, 0x46,
+        0x4d, 0x88, 0xed, 0x62, 0x8e, 0xdb, 0x3f, 0xb9,
+        0xd7, 0xc8, 0xe3, 0xcf, 0xb8, 0x34, 0x2c, 0xd2,
+        0x6f, 0x28, 0x06, 0x41, 0xe3, 0x66, 0x8c, 0xfc,
+        0x72, 0xff, 0x26, 0x3b, 0x6b, 0x6c, 0x6f, 0x73,
+        0xde, 0xf2, 0x90, 0x29, 0xe0, 0x61, 0x32, 0xc4,
+        0x12, 0x74, 0x09, 0x52, 0xec, 0xf3, 0x1b, 0xa6,
+        0x45, 0x98, 0xac, 0xf9, 0x1c, 0x65, 0x8e, 0x3a,
+        0x91, 0x84, 0x4b, 0x23, 0x8a, 0xb2, 0x3c, 0xc9,
+        0xfa, 0xea, 0xf1, 0x38, 0xce, 0xd8, 0x05, 0xe0,
+        0xfa, 0x44, 0x68, 0x1f, 0xeb, 0xd9, 0x57, 0xb8,
+        0x4a, 0x97, 0x5b, 0x88, 0xc5, 0xf1, 0xbb, 0xb0,
+        0x49, 0xc3, 0x91, 0x7c, 0xd3, 0x13, 0xb9, 0x47,
+        0xbb, 0x91, 0x8f, 0xe5, 0x26, 0x07, 0xab, 0xa9,
+        0xc5, 0xd0, 0x3d, 0x95, 0x41, 0x26, 0x92, 0x9d,
+        0x13, 0x67, 0xf2, 0x7e, 0x11, 0x88, 0xdc, 0x2d
+    };
+    static const byte g[] = {
+        0x4a, 0x1a, 0xf3, 0xa4, 0x92, 0xe9, 0xee, 0x74,
+        0x6e, 0x57, 0xd5, 0x8c, 0x2c, 0x5b, 0x41, 0x41,
+        0x5e, 0xd4, 0x55, 0x19, 0xdc, 0xd9, 0x32, 0x91,
+        0xf7, 0xfd, 0xc2, 0x57, 0xff, 0x03, 0x14, 0xdb,
+        0xf1, 0xb7, 0x60, 0x0c, 0x43, 0x59, 0x3f, 0xff,
+        0xac, 0xf1, 0x80, 0x9a, 0x15, 0x6f, 0xd8, 0x6e,
+        0xb7, 0x85, 0x18, 0xc8, 0xec, 0x4e, 0x59, 0x4a,
+        0xe2, 0x91, 0x43, 0x4c, 0xeb, 0x95, 0xb6, 0x2e,
+        0x9a, 0xea, 0x53, 0x68, 0x80, 0x64, 0x69, 0x40,
+        0xf9, 0xec, 0xbd, 0x85, 0x89, 0x26, 0x97, 0x67,
+        0xaf, 0xb0, 0xad, 0x00, 0x1b, 0xd4, 0xfd, 0x94,
+        0xd3, 0xe9, 0x92, 0xb1, 0xb4, 0xbc, 0x5a, 0xaa,
+        0x92, 0x80, 0x89, 0x3b, 0x39, 0x05, 0x6c, 0x22,
+        0x26, 0xfe, 0x5a, 0x28, 0x6c, 0x37, 0x50, 0x5a,
+        0x38, 0x99, 0xcf, 0xf3, 0xc1, 0x96, 0x45, 0xdc,
+        0x01, 0xcb, 0x20, 0x87, 0xa5, 0x00, 0x8c, 0xf5,
+        0x4d, 0xc2, 0xef, 0xb8, 0x9b, 0xd1, 0x87, 0xbe,
+        0xed, 0xd5, 0x0a, 0x29, 0x15, 0x34, 0x59, 0x4c,
+        0x3a, 0x05, 0x22, 0x05, 0x44, 0x4f, 0x9f, 0xc8,
+        0x47, 0x12, 0x24, 0x8e, 0xa8, 0x79, 0xe4, 0x67,
+        0xba, 0x4d, 0x5b, 0x75, 0x56, 0x95, 0xeb, 0xe8,
+        0x8a, 0xfa, 0x8e, 0x01, 0x8c, 0x1b, 0x74, 0x63,
+        0xd9, 0x2f, 0xf7, 0xd3, 0x44, 0x8f, 0xa8, 0xf5,
+        0xaf, 0x6c, 0x4f, 0xdb, 0xe7, 0xc9, 0x6c, 0x71,
+        0x22, 0xa3, 0x1d, 0xf1, 0x40, 0xb2, 0xe0, 0x9a,
+        0xb6, 0x72, 0xc9, 0xc0, 0x13, 0x16, 0xa2, 0x4a,
+        0xe1, 0x92, 0xc7, 0x54, 0x23, 0xab, 0x9d, 0xa1,
+        0xa1, 0xe5, 0x0b, 0xed, 0xba, 0xe8, 0x84, 0x37,
+        0xb2, 0xe7, 0xfe, 0x32, 0x8d, 0xfa, 0x1c, 0x53,
+        0x77, 0x97, 0xc7, 0xf3, 0x48, 0xc9, 0xdb, 0x2d,
+        0x75, 0x52, 0x9d, 0x42, 0x51, 0x78, 0x62, 0x68,
+        0x05, 0x45, 0x15, 0xf8, 0xa2, 0x4e, 0xf3, 0x0b
+    };
+    static const byte q[] = {
+        0xe0, 0x35, 0x37, 0xaf, 0xb2, 0x50, 0x91, 0x8e,
+        0xf2, 0x62, 0x2b, 0xd9, 0x9f, 0x6c, 0x11, 0x75,
+        0xec, 0x24, 0x5d, 0x78, 0x59, 0xe7, 0x8d, 0xb5,
+        0x40, 0x52, 0xed, 0x41
+    };
+    byte   priv[256];
+    byte   pub[256];
+    word32 privSz = sizeof(priv);
+    word32 pubSz = sizeof(pub);
+
+    ret = wc_InitDhKey_ex(&key, HEAP_HINT, devId);
+    if (ret != 0)
+        return -5725;
+
+    ret = wc_DhSetKey_ex(&key, p, sizeof(p), g, sizeof(g), q, sizeof(q));
+    if (ret != 0) {
+        ERROR_OUT(-5726, exit_gen_test);
+    }
+
+    /* Generate the key pair through the public API. */
+    ret = wc_DhGenerateKeyPair(&key, rng, priv, &privSz, pub, &pubSz);
+#if defined(WOLFSSL_ASYNC_CRYPT)
+    ret = wc_AsyncWait(ret, &key.asyncDev, WC_ASYNC_FLAG_NONE);
+#endif
+    if (ret != 0) {
+        ret = -5727;
+    }
+
+exit_gen_test:
+    /* Common exit: the key was initialized above, so always free it. */
+    wc_FreeDhKey(&key);
+
+    return ret;
+}
+
 static int dh_generate_test(WC_RNG *rng)
 {
     int    ret = 0;
@@ -10469,6 +10576,8 @@ int dh_test(void)
     }
 
     ret = dh_generate_test(&rng);
+    if (ret == 0)
+        ret = dh_fips_generate_test(&rng);
 
 done:
 
diff --git a/wolfssl/wolfcrypt/sp_int.h b/wolfssl/wolfcrypt/sp_int.h
index 11cc6cdd5..51b9e2fb8 100644
--- a/wolfssl/wolfcrypt/sp_int.h
+++ b/wolfssl/wolfcrypt/sp_int.h
@@ -128,6 +128,7 @@ MP_API void sp_clamp(sp_int* a);
 MP_API int sp_grow(sp_int* a, int l);
 MP_API int sp_sub_d(sp_int* a, sp_int_digit d, sp_int* r);
 MP_API int sp_cmp_d(sp_int* a, sp_int_digit d);
+MP_API int sp_mod(sp_int* a, sp_int* m, sp_int* r);
 MP_API void sp_zero(sp_int* a);
 MP_API int sp_add_d(sp_int* a, sp_int_digit d, sp_int* r);
 MP_API int sp_lshd(sp_int* a, int s);
@@ -173,6 +174,7 @@ typedef sp_digit mp_digit;
 #define mp_grow                 sp_grow
 #define mp_sub_d                sp_sub_d
 #define mp_cmp_d                sp_cmp_d
+#define mp_mod                  sp_mod
 #define mp_zero                 sp_zero
 #define mp_add_d                sp_add_d
 #define mp_lshd                 sp_lshd