diff --git a/wolfcrypt/src/fp_mont_small.i b/wolfcrypt/src/fp_mont_small.i index bd24d0f1b..19a4042c9 100644 --- a/wolfcrypt/src/fp_mont_small.i +++ b/wolfcrypt/src/fp_mont_small.i @@ -23,13 +23,26 @@ #ifdef TFM_SMALL_MONT_SET /* computes x/R == x (mod N) via Montgomery Reduction */ -void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp) +int fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp) { - fp_digit c[FP_SIZE], *_c, *tmpm, mu, cy; +#ifndef WOLFSSL_SMALL_STACK + fp_digit c[FP_SIZE]; +#else + fp_digit *c; +#endif + fp_digit *_c, *tmpm, mu, cy; int oldused, x, y, pa; +#ifdef WOLFSSL_SMALL_STACK + /* only allocate space for what's needed for window plus res */ + c = (fp_digit*)XMALLOC(sizeof(fp_digit)*FP_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (c == NULL) { + return FP_MEM; + } +#endif + /* now zero the buff */ - XMEMSET(c, 0, sizeof(c)); + XMEMSET(c, 0, sizeof(fp_digit)*(FP_SIZE)); pa = m->used; @@ -3851,6 +3864,11 @@ void fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp) if (fp_cmp_mag (a, m) != FP_LT) { s_fp_sub (a, m, a); } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(c, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_12.i b/wolfcrypt/src/fp_mul_comba_12.i index 7cdcf596f..47f4e594f 100644 --- a/wolfcrypt/src/fp_mul_comba_12.i +++ b/wolfcrypt/src/fp_mul_comba_12.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL12 -void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[24]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[24]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 24, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 12 * sizeof(fp_digit)); XMEMCPY(at+12, B->dp, 12 * sizeof(fp_digit)); @@ -127,5 +138,10 @@ void fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_17.i b/wolfcrypt/src/fp_mul_comba_17.i index 503560e6f..4f8c718c8 100644 --- a/wolfcrypt/src/fp_mul_comba_17.i +++ b/wolfcrypt/src/fp_mul_comba_17.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL17 -void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[34]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[34]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 34, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 17 * sizeof(fp_digit)); XMEMCPY(at+17, B->dp, 17 * sizeof(fp_digit)); @@ -167,5 +178,10 @@ void fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_20.i b/wolfcrypt/src/fp_mul_comba_20.i index bd008b15c..e30e93093 100644 --- a/wolfcrypt/src/fp_mul_comba_20.i +++ b/wolfcrypt/src/fp_mul_comba_20.i @@ -21,10 +21,21 @@ #ifdef TFM_MUL20 -void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[40]; - + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[40]; +#else + fp_digit *at; 
+#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 40, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif + XMEMCPY(at, A->dp, 20 * sizeof(fp_digit)); XMEMCPY(at+20, B->dp, 20 * sizeof(fp_digit)); COMBA_START; @@ -190,5 +201,10 @@ void fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_24.i b/wolfcrypt/src/fp_mul_comba_24.i index b37ee7822..b2f915bd8 100644 --- a/wolfcrypt/src/fp_mul_comba_24.i +++ b/wolfcrypt/src/fp_mul_comba_24.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL24 -void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[48]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[48]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 48, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 24 * sizeof(fp_digit)); XMEMCPY(at+24, B->dp, 24 * sizeof(fp_digit)); @@ -223,5 +234,10 @@ void fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_28.i b/wolfcrypt/src/fp_mul_comba_28.i index 0e6a65cb9..cea5c2fd7 100644 --- a/wolfcrypt/src/fp_mul_comba_28.i +++ b/wolfcrypt/src/fp_mul_comba_28.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL28 -void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[56]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[56]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 56, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 28 * sizeof(fp_digit)); XMEMCPY(at+28, B->dp, 28 * sizeof(fp_digit)); @@ -255,5 +266,10 @@ void fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_3.i b/wolfcrypt/src/fp_mul_comba_3.i index a628b503d..1962e030b 100644 --- a/wolfcrypt/src/fp_mul_comba_3.i +++ b/wolfcrypt/src/fp_mul_comba_3.i @@ -22,7 +22,7 @@ #ifdef TFM_MUL3 -void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C) { fp_digit c0, c1, c2, at[6]; @@ -55,5 +55,7 @@ void fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_32.i b/wolfcrypt/src/fp_mul_comba_32.i index 9a97af04f..0d82c7f3b 100644 --- a/wolfcrypt/src/fp_mul_comba_32.i +++ b/wolfcrypt/src/fp_mul_comba_32.i @@ -22,10 +22,21 @@ #ifdef TFM_MUL32 -void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[64]; int out_size; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[64]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif out_size = 
A->used + B->used; XMEMCPY(at, A->dp, 32 * sizeof(fp_digit)); @@ -190,7 +201,7 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) COMBA_STORE(C->dp[38]); /* early out at 40 digits, 40*32==1280, or two 640 bit operands */ - if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return; } + if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; } /* 39 */ COMBA_FORWARD; @@ -226,7 +237,7 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) COMBA_STORE(C->dp[46]); /* early out at 48 digits, 48*32==1536, or two 768 bit operands */ - if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return; } + if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; } /* 47 */ COMBA_FORWARD; @@ -262,7 +273,7 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) COMBA_STORE(C->dp[54]); /* early out at 56 digits, 56*32==1792, or two 896 bit operands */ - if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return; } + if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; } /* 55 */ COMBA_FORWARD; @@ -301,5 +312,10 @@ void fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_4.i b/wolfcrypt/src/fp_mul_comba_4.i index 536f5ffa1..0619c1dff 100644 --- a/wolfcrypt/src/fp_mul_comba_4.i +++ b/wolfcrypt/src/fp_mul_comba_4.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL4 -void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[8]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[8]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 8, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 4 * sizeof(fp_digit)); XMEMCPY(at+4, B->dp, 4 * sizeof(fp_digit)); @@ -63,5 +74,10 @@ void fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_48.i b/wolfcrypt/src/fp_mul_comba_48.i index 20653bdfc..7325cb010 100644 --- a/wolfcrypt/src/fp_mul_comba_48.i +++ b/wolfcrypt/src/fp_mul_comba_48.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL48 -void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[96]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[96]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 96, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 48 * sizeof(fp_digit)); XMEMCPY(at+48, B->dp, 48 * sizeof(fp_digit)); @@ -415,5 +426,10 @@ void fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git 
a/wolfcrypt/src/fp_mul_comba_6.i b/wolfcrypt/src/fp_mul_comba_6.i index a453c988e..4c1263515 100644 --- a/wolfcrypt/src/fp_mul_comba_6.i +++ b/wolfcrypt/src/fp_mul_comba_6.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL6 -void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[12]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[12]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 12, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 6 * sizeof(fp_digit)); XMEMCPY(at+6, B->dp, 6 * sizeof(fp_digit)); @@ -79,5 +90,10 @@ void fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_64.i b/wolfcrypt/src/fp_mul_comba_64.i index 2f2da7911..da50d47e0 100644 --- a/wolfcrypt/src/fp_mul_comba_64.i +++ b/wolfcrypt/src/fp_mul_comba_64.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL64 -void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[128]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[128]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 128, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 64 * sizeof(fp_digit)); XMEMCPY(at+64, B->dp, 64 * sizeof(fp_digit)); @@ -543,5 +554,10 @@ void fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_7.i b/wolfcrypt/src/fp_mul_comba_7.i index 84da50d2b..dac7ffbc6 100644 --- a/wolfcrypt/src/fp_mul_comba_7.i +++ b/wolfcrypt/src/fp_mul_comba_7.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL7 -void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[14]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[14]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 14, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 7 * sizeof(fp_digit)); XMEMCPY(at+7, B->dp, 7 * sizeof(fp_digit)); @@ -87,5 +98,10 @@ void fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; -} + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); #endif + return FP_OKAY; +} +#endif diff --git a/wolfcrypt/src/fp_mul_comba_8.i b/wolfcrypt/src/fp_mul_comba_8.i index 1c919f954..0a2afa73e 100644 --- a/wolfcrypt/src/fp_mul_comba_8.i +++ b/wolfcrypt/src/fp_mul_comba_8.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL8 -void fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[16]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[16]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 8 * sizeof(fp_digit)); XMEMCPY(at+8, B->dp, 8 * sizeof(fp_digit)); @@ -95,5 +106,10 @@ void 
fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_9.i b/wolfcrypt/src/fp_mul_comba_9.i index 175130e0a..a9ccc5070 100644 --- a/wolfcrypt/src/fp_mul_comba_9.i +++ b/wolfcrypt/src/fp_mul_comba_9.i @@ -22,9 +22,20 @@ #ifdef TFM_MUL9 -void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[18]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[18]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 18, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif XMEMCPY(at, A->dp, 9 * sizeof(fp_digit)); XMEMCPY(at+9, B->dp, 9 * sizeof(fp_digit)); @@ -103,5 +114,10 @@ void fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C) C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_mul_comba_small_set.i b/wolfcrypt/src/fp_mul_comba_small_set.i index 46cc4e5ad..0385eaeb3 100644 --- a/wolfcrypt/src/fp_mul_comba_small_set.i +++ b/wolfcrypt/src/fp_mul_comba_small_set.i @@ -22,9 +22,21 @@ #if defined(TFM_SMALL_SET) -void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C) { - fp_digit c0, c1, c2, at[32]; + fp_digit c0, c1, c2; +#ifndef WOLFSSL_SMALL_STACK + fp_digit at[32]; +#else + fp_digit *at; +#endif + +#ifdef WOLFSSL_SMALL_STACK + at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 32, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (at == NULL) + return FP_MEM; +#endif + switch (MAX(A->used, B->used)) { case 1: @@ -1246,6 +1258,11 @@ void fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C) default: break; } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_12.i b/wolfcrypt/src/fp_sqr_comba_12.i index 256071073..5e6bd869f 100644 --- a/wolfcrypt/src/fp_sqr_comba_12.i +++ b/wolfcrypt/src/fp_sqr_comba_12.i @@ -22,12 +22,24 @@ #ifdef TFM_SQR12 -void fp_sqr_comba12(fp_int *A, fp_int *B) +int fp_sqr_comba12(fp_int *A, fp_int *B) { - fp_digit *a, b[24], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[24]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 24, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif + a = A->dp; COMBA_START; @@ -154,6 +166,11 @@ void fp_sqr_comba12(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 24 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_17.i b/wolfcrypt/src/fp_sqr_comba_17.i index 1d612de48..8800fc6ca 100644 --- a/wolfcrypt/src/fp_sqr_comba_17.i +++ b/wolfcrypt/src/fp_sqr_comba_17.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR17 -void fp_sqr_comba17(fp_int *A, fp_int *B) +int fp_sqr_comba17(fp_int *A, fp_int *B) { - fp_digit *a, b[34], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[34]; +#else + fp_digit 
*b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 34, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -205,6 +216,11 @@ void fp_sqr_comba17(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 34 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_20.i b/wolfcrypt/src/fp_sqr_comba_20.i index fbe929f5b..482f030f5 100644 --- a/wolfcrypt/src/fp_sqr_comba_20.i +++ b/wolfcrypt/src/fp_sqr_comba_20.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR20 -void fp_sqr_comba20(fp_int *A, fp_int *B) +int fp_sqr_comba20(fp_int *A, fp_int *B) { - fp_digit *a, b[40], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[40]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 40, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -235,6 +246,11 @@ void fp_sqr_comba20(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 40 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_24.i b/wolfcrypt/src/fp_sqr_comba_24.i index 5e7d6ee96..b1a19f650 100644 --- a/wolfcrypt/src/fp_sqr_comba_24.i +++ b/wolfcrypt/src/fp_sqr_comba_24.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR24 -void fp_sqr_comba24(fp_int *A, fp_int *B) +int fp_sqr_comba24(fp_int *A, fp_int *B) { - fp_digit *a, b[48], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[48]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 48, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -275,6 +286,11 @@ void fp_sqr_comba24(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 48 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_28.i b/wolfcrypt/src/fp_sqr_comba_28.i index c1dc316cb..c90e8da32 100644 --- a/wolfcrypt/src/fp_sqr_comba_28.i +++ b/wolfcrypt/src/fp_sqr_comba_28.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR28 -void fp_sqr_comba28(fp_int *A, fp_int *B) +int fp_sqr_comba28(fp_int *A, fp_int *B) { - fp_digit *a, b[56], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[56]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 56, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -315,6 +326,11 @@ void fp_sqr_comba28(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 56 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_3.i b/wolfcrypt/src/fp_sqr_comba_3.i index bde53e05e..7691277b9 100644 --- a/wolfcrypt/src/fp_sqr_comba_3.i +++ 
b/wolfcrypt/src/fp_sqr_comba_3.i @@ -22,7 +22,7 @@ #ifdef TFM_SQR3 -void fp_sqr_comba3(fp_int *A, fp_int *B) +int fp_sqr_comba3(fp_int *A, fp_int *B) { fp_digit *a, b[6], c0, c1, c2; #ifdef TFM_ISO @@ -65,6 +65,8 @@ void fp_sqr_comba3(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 6 * sizeof(fp_digit)); fp_clamp(B); + + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_32.i b/wolfcrypt/src/fp_sqr_comba_32.i index 6042027e8..fa42b43ab 100644 --- a/wolfcrypt/src/fp_sqr_comba_32.i +++ b/wolfcrypt/src/fp_sqr_comba_32.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR32 -void fp_sqr_comba32(fp_int *A, fp_int *B) +int fp_sqr_comba32(fp_int *A, fp_int *B) { - fp_digit *a, b[64], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[64]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -355,6 +366,11 @@ void fp_sqr_comba32(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 64 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_4.i b/wolfcrypt/src/fp_sqr_comba_4.i index e00500db2..2b0dd41c0 100644 --- a/wolfcrypt/src/fp_sqr_comba_4.i +++ b/wolfcrypt/src/fp_sqr_comba_4.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR4 -void fp_sqr_comba4(fp_int *A, fp_int *B) +int fp_sqr_comba4(fp_int *A, fp_int *B) { - fp_digit *a, b[8], c0, c1, c2; + fp_digit *a, c0, c1, c2; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[8]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 8, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -75,6 +86,11 @@ void fp_sqr_comba4(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 8 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_48.i b/wolfcrypt/src/fp_sqr_comba_48.i index a3d127bdb..ddd2d0e3c 100644 --- a/wolfcrypt/src/fp_sqr_comba_48.i +++ b/wolfcrypt/src/fp_sqr_comba_48.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR48 -void fp_sqr_comba48(fp_int *A, fp_int *B) +int fp_sqr_comba48(fp_int *A, fp_int *B) { - fp_digit *a, b[96], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[96]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 96, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -515,6 +526,11 @@ void fp_sqr_comba48(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 96 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_6.i b/wolfcrypt/src/fp_sqr_comba_6.i index aa80e9774..f70947260 100644 --- a/wolfcrypt/src/fp_sqr_comba_6.i +++ b/wolfcrypt/src/fp_sqr_comba_6.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR6 -void fp_sqr_comba6(fp_int *A, fp_int *B) +int fp_sqr_comba6(fp_int *A, fp_int *B) { - fp_digit *a, b[12], c0, 
c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[12]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 12, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -95,6 +106,11 @@ void fp_sqr_comba6(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 12 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_64.i b/wolfcrypt/src/fp_sqr_comba_64.i index a6e57a380..618765167 100644 --- a/wolfcrypt/src/fp_sqr_comba_64.i +++ b/wolfcrypt/src/fp_sqr_comba_64.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR64 -void fp_sqr_comba64(fp_int *A, fp_int *B) +int fp_sqr_comba64(fp_int *A, fp_int *B) { - fp_digit *a, b[128], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[128]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 128, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -675,6 +686,11 @@ void fp_sqr_comba64(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 128 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_7.i b/wolfcrypt/src/fp_sqr_comba_7.i index fdd6cf276..8f9e72a37 100644 --- a/wolfcrypt/src/fp_sqr_comba_7.i +++ b/wolfcrypt/src/fp_sqr_comba_7.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR7 -void fp_sqr_comba7(fp_int *A, fp_int *B) +int fp_sqr_comba7(fp_int *A, fp_int *B) { - fp_digit *a, b[14], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[14]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 14, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -105,6 +116,11 @@ void fp_sqr_comba7(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 14 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_8.i b/wolfcrypt/src/fp_sqr_comba_8.i index 43a9192ca..f30629413 100644 --- a/wolfcrypt/src/fp_sqr_comba_8.i +++ b/wolfcrypt/src/fp_sqr_comba_8.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR8 -void fp_sqr_comba8(fp_int *A, fp_int *B) +int fp_sqr_comba8(fp_int *A, fp_int *B) { - fp_digit *a, b[16], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[16]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -115,6 +126,11 @@ void fp_sqr_comba8(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 16 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git 
a/wolfcrypt/src/fp_sqr_comba_9.i b/wolfcrypt/src/fp_sqr_comba_9.i index 18d8773f9..bde361728 100644 --- a/wolfcrypt/src/fp_sqr_comba_9.i +++ b/wolfcrypt/src/fp_sqr_comba_9.i @@ -22,12 +22,23 @@ #ifdef TFM_SQR9 -void fp_sqr_comba9(fp_int *A, fp_int *B) +int fp_sqr_comba9(fp_int *A, fp_int *B) { - fp_digit *a, b[18], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO fp_word tt; #endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[18]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 18, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif a = A->dp; COMBA_START; @@ -125,6 +136,11 @@ void fp_sqr_comba9(fp_int *A, fp_int *B) B->sign = FP_ZPOS; XMEMCPY(B->dp, b, 18 * sizeof(fp_digit)); fp_clamp(B); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif diff --git a/wolfcrypt/src/fp_sqr_comba_small_set.i b/wolfcrypt/src/fp_sqr_comba_small_set.i index 9b53ed9b4..8494850fc 100644 --- a/wolfcrypt/src/fp_sqr_comba_small_set.i +++ b/wolfcrypt/src/fp_sqr_comba_small_set.i @@ -22,12 +22,24 @@ #if defined(TFM_SMALL_SET) -void fp_sqr_comba_small(fp_int *A, fp_int *B) +int fp_sqr_comba_small(fp_int *A, fp_int *B) { - fp_digit *a, b[32], c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; + fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0; #ifdef TFM_ISO - fp_word tt; -#endif + fp_word tt; +#endif +#ifndef WOLFSSL_SMALL_STACK + fp_digit b[32]; +#else + fp_digit *b; +#endif + +#ifdef WOLFSSL_SMALL_STACK + b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 32, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; +#endif + switch (A->used) { case 1: a = A->dp; @@ -1535,7 +1547,12 @@ void fp_sqr_comba_small(fp_int *A, fp_int *B) default: break; -} + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif /* TFM_SMALL_SET */ diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c index 07d8b0db6..b1aea63aa 100644 --- a/wolfcrypt/src/tfm.c +++ b/wolfcrypt/src/tfm.c @@ -198,8 +198,9 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c) } /* c = a * b */ -void fp_mul(fp_int *A, fp_int *B, fp_int *C) +int fp_mul(fp_int *A, fp_int *B, fp_int *C) { + int ret = 0; int y, yy, oldused; oldused = C->used; @@ -209,7 +210,7 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C) /* call generic if we're out of range */ if (y + yy > FP_SIZE) { - fp_mul_comba(A, B, C); + ret = fp_mul_comba(A, B, C); goto clean; } @@ -221,102 +222,104 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C) #if defined(TFM_MUL3) && FP_SIZE >= 6 if (y <= 3) { - fp_mul_comba3(A,B,C); + ret = fp_mul_comba3(A,B,C); goto clean; } #endif #if defined(TFM_MUL4) && FP_SIZE >= 8 if (y == 4) { - fp_mul_comba4(A,B,C); + ret = fp_mul_comba4(A,B,C); goto clean; } #endif #if defined(TFM_MUL6) && FP_SIZE >= 12 if (y <= 6) { - fp_mul_comba6(A,B,C); + ret = fp_mul_comba6(A,B,C); goto clean; } #endif #if defined(TFM_MUL7) && FP_SIZE >= 14 if (y == 7) { - fp_mul_comba7(A,B,C); + ret = fp_mul_comba7(A,B,C); goto clean; } #endif #if defined(TFM_MUL8) && FP_SIZE >= 16 if (y == 8) { - fp_mul_comba8(A,B,C); + ret = fp_mul_comba8(A,B,C); goto clean; } #endif #if defined(TFM_MUL9) && FP_SIZE >= 18 if (y == 9) { - fp_mul_comba9(A,B,C); + ret = fp_mul_comba9(A,B,C); goto clean; } #endif #if defined(TFM_MUL12) && FP_SIZE >= 24 if (y <= 12) { - fp_mul_comba12(A,B,C); + ret = fp_mul_comba12(A,B,C); goto clean; } #endif #if defined(TFM_MUL17) && FP_SIZE >= 34 if (y <= 
17) { - fp_mul_comba17(A,B,C); + ret = fp_mul_comba17(A,B,C); goto clean; } #endif #if defined(TFM_SMALL_SET) && FP_SIZE >= 32 if (y <= 16) { - fp_mul_comba_small(A,B,C); + ret = fp_mul_comba_small(A,B,C); goto clean; } #endif #if defined(TFM_MUL20) && FP_SIZE >= 40 if (y <= 20) { - fp_mul_comba20(A,B,C); + ret = fp_mul_comba20(A,B,C); goto clean; } #endif #if defined(TFM_MUL24) && FP_SIZE >= 48 if (yy >= 16 && y <= 24) { - fp_mul_comba24(A,B,C); + ret = fp_mul_comba24(A,B,C); goto clean; } #endif #if defined(TFM_MUL28) && FP_SIZE >= 56 if (yy >= 20 && y <= 28) { - fp_mul_comba28(A,B,C); + ret = fp_mul_comba28(A,B,C); goto clean; } #endif #if defined(TFM_MUL32) && FP_SIZE >= 64 if (yy >= 24 && y <= 32) { - fp_mul_comba32(A,B,C); + ret = fp_mul_comba32(A,B,C); goto clean; } #endif #if defined(TFM_MUL48) && FP_SIZE >= 96 if (yy >= 40 && y <= 48) { - fp_mul_comba48(A,B,C); + ret = fp_mul_comba48(A,B,C); goto clean; } #endif #if defined(TFM_MUL64) && FP_SIZE >= 128 if (yy >= 56 && y <= 64) { - fp_mul_comba64(A,B,C); + ret = fp_mul_comba64(A,B,C); goto clean; } #endif - fp_mul_comba(A,B,C); + ret = fp_mul_comba(A,B,C); clean: /* zero any excess digits on the destination that we didn't write to */ for (y = C->used; y >= 0 && y < oldused; y++) { C->dp[y] = 0; } + + return ret; } void fp_mul_2(fp_int * a, fp_int * b) @@ -431,11 +434,22 @@ void fp_mul_2d(fp_int *a, int b, fp_int *c) /* generic PxQ multiplier */ #if defined(HAVE_INTEL_MULX) -WC_INLINE static void fp_mul_comba_mulx(fp_int *A, fp_int *B, fp_int *C) +WC_INLINE static int fp_mul_comba_mulx(fp_int *A, fp_int *B, fp_int *C) { int ix, iy, iz, pa; - fp_int tmp, *dst; + fp_int *dst; +#ifndef WOLFSSL_SMALL_STACK + fp_int tmp[1]; +#else + fp_int *tmp; +#endif + +#ifdef WOLFSSL_SMALL_STACK + tmp = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) + return FP_MEM; +#endif /* get size of output and trim */ pa = A->used + B->used; @@ -446,8 +460,8 @@ WC_INLINE static void fp_mul_comba_mulx(fp_int *A, fp_int *B, fp_int *C) /* Always take branch to use tmp variable. This avoids a cache attack for * determining if C equals A */ if (1) { - fp_init(&tmp); - dst = &tmp; + fp_init(tmp); + dst = tmp; } TFM_INTEL_MUL_COMBA(A, B, dst) ; @@ -456,16 +470,34 @@ WC_INLINE static void fp_mul_comba_mulx(fp_int *A, fp_int *B, fp_int *C) dst->sign = A->sign ^ B->sign; fp_clamp(dst); fp_copy(dst, C); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(tmp, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + + return FP_OKAY; } #endif -void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) +int fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) { + int ret = 0; int ix, iy, iz, tx, ty, pa; fp_digit c0, c1, c2, *tmpx, *tmpy; - fp_int tmp, *dst; + fp_int *dst; +#ifndef WOLFSSL_SMALL_STACK + fp_int tmp[1]; +#else + fp_int *tmp; +#endif - IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), return) ; + IF_HAVE_INTEL_MULX(ret = fp_mul_comba_mulx(A, B, C), return ret) ; + +#ifdef WOLFSSL_SMALL_STACK + tmp = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) + return FP_MEM; +#endif COMBA_START; COMBA_CLEAR; @@ -479,8 +511,8 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) /* Always take branch to use tmp variable. 
This avoids a cache attack for * determining if C equals A */ if (1) { - fp_init(&tmp); - dst = &tmp; + fp_init(tmp); + dst = tmp; } for (ix = 0; ix < pa; ix++) { @@ -514,13 +546,22 @@ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) dst->sign = A->sign ^ B->sign; fp_clamp(dst); fp_copy(dst, C); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(tmp, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return ret; } /* a/b => cb + d == a */ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d) { - fp_int q, x, y, t1, t2; int n, t, i, norm, neg; +#ifndef WOLFSSL_SMALL_STACK + fp_int q[1], x[1], y[1], t1[1], t2[1]; +#else + fp_int *q, *x, *y, *t1, *t2; +#endif /* is divisor zero ? */ if (fp_iszero (b) == FP_YES) { @@ -538,59 +579,67 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d) return FP_OKAY; } - fp_init(&q); - q.used = a->used + 2; +#ifdef WOLFSSL_SMALL_STACK + q = (fp_int*)XMALLOC(sizeof(fp_int) * 5, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (q == NULL) { + return FP_MEM; + } + x = &q[1]; y = &q[2]; t1 = &q[3]; t2 = &q[4]; +#endif - fp_init(&t1); - fp_init(&t2); - fp_init_copy(&x, a); - fp_init_copy(&y, b); + fp_init(q); + q->used = a->used + 2; + + fp_init(t1); + fp_init(t2); + fp_init_copy(x, a); + fp_init_copy(y, b); /* fix the sign */ neg = (a->sign == b->sign) ? FP_ZPOS : FP_NEG; - x.sign = y.sign = FP_ZPOS; + x->sign = y->sign = FP_ZPOS; /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */ - norm = fp_count_bits(&y) % DIGIT_BIT; + norm = fp_count_bits(y) % DIGIT_BIT; if (norm < (int)(DIGIT_BIT-1)) { norm = (DIGIT_BIT-1) - norm; - fp_mul_2d (&x, norm, &x); - fp_mul_2d (&y, norm, &y); + fp_mul_2d (x, norm, x); + fp_mul_2d (y, norm, y); } else { norm = 0; } /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ - n = x.used - 1; - t = y.used - 1; + n = x->used - 1; + t = y->used - 1; /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */ - fp_lshd (&y, n - t); /* y = y*b**{n-t} */ + fp_lshd (y, n - t); /* y = y*b**{n-t} */ - while (fp_cmp (&x, &y) != FP_LT) { - ++(q.dp[n - t]); - fp_sub (&x, &y, &x); + while (fp_cmp (x, y) != FP_LT) { + ++(q->dp[n - t]); + fp_sub (x, y, x); } /* reset y by shifting it back down */ - fp_rshd (&y, n - t); + fp_rshd (y, n - t); /* step 3. for i from n down to (t + 1) */ for (i = n; i >= (t + 1); i--) { - if (i > x.used) { + if (i > x->used) { continue; } /* step 3.1 if xi == yt then set q{i-t-1} to b-1, * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */ - if (x.dp[i] == y.dp[t]) { - q.dp[i - t - 1] = (fp_digit) ((((fp_word)1) << DIGIT_BIT) - 1); + if (x->dp[i] == y->dp[t]) { + q->dp[i - t - 1] = (fp_digit) ((((fp_word)1) << DIGIT_BIT) - 1); } else { fp_word tmp; - tmp = ((fp_word) x.dp[i]) << ((fp_word) DIGIT_BIT); - tmp |= ((fp_word) x.dp[i - 1]); - tmp /= ((fp_word)y.dp[t]); - q.dp[i - t - 1] = (fp_digit) (tmp); + tmp = ((fp_word) x->dp[i]) << ((fp_word) DIGIT_BIT); + tmp |= ((fp_word) x->dp[i - 1]); + tmp /= ((fp_word)y->dp[t]); + q->dp[i - t - 1] = (fp_digit) (tmp); } /* while (q{i-t-1} * (yt * b + y{t-1})) > @@ -598,35 +647,35 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d) do q{i-t-1} -= 1; */ - q.dp[i - t - 1] = (q.dp[i - t - 1] + 1); + q->dp[i - t - 1] = (q->dp[i - t - 1] + 1); do { - q.dp[i - t - 1] = (q.dp[i - t - 1] - 1); + q->dp[i - t - 1] = (q->dp[i - t - 1] - 1); /* find left hand */ - fp_zero (&t1); - t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1]; - t1.dp[1] = y.dp[t]; - t1.used = 2; - fp_mul_d (&t1, q.dp[i - t - 1], &t1); + fp_zero (t1); + t1->dp[0] = (t - 1 < 0) ? 
0 : y->dp[t - 1]; + t1->dp[1] = y->dp[t]; + t1->used = 2; + fp_mul_d (t1, q->dp[i - t - 1], t1); /* find right hand */ - t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2]; - t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1]; - t2.dp[2] = x.dp[i]; - t2.used = 3; - } while (fp_cmp_mag(&t1, &t2) == FP_GT); + t2->dp[0] = (i - 2 < 0) ? 0 : x->dp[i - 2]; + t2->dp[1] = (i - 1 < 0) ? 0 : x->dp[i - 1]; + t2->dp[2] = x->dp[i]; + t2->used = 3; + } while (fp_cmp_mag(t1, t2) == FP_GT); /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */ - fp_mul_d (&y, q.dp[i - t - 1], &t1); - fp_lshd (&t1, i - t - 1); - fp_sub (&x, &t1, &x); + fp_mul_d (y, q->dp[i - t - 1], t1); + fp_lshd (t1, i - t - 1); + fp_sub (x, t1, x); /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */ - if (x.sign == FP_NEG) { - fp_copy (&y, &t1); - fp_lshd (&t1, i - t - 1); - fp_add (&x, &t1, &x); - q.dp[i - t - 1] = q.dp[i - t - 1] - 1; + if (x->sign == FP_NEG) { + fp_copy (y, t1); + fp_lshd (t1, i - t - 1); + fp_add (x, t1, x); + q->dp[i - t - 1] = q->dp[i - t - 1] - 1; } } @@ -635,25 +684,28 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d) */ /* get sign before writing to c */ - x.sign = x.used == 0 ? FP_ZPOS : a->sign; + x->sign = x->used == 0 ? FP_ZPOS : a->sign; if (c != NULL) { - fp_clamp (&q); - fp_copy (&q, c); + fp_clamp (q); + fp_copy (q, c); c->sign = neg; } if (d != NULL) { - fp_div_2d (&x, norm, &x, NULL); + fp_div_2d (x, norm, x, NULL); /* zero any excess digits on the destination that we didn't write to */ - for (i = b->used; i < x.used; i++) { - x.dp[i] = 0; + for (i = b->used; i < x->used; i++) { + x->dp[i] = 0; } - fp_clamp(&x); - fp_copy (&x, d); + fp_clamp(x); + fp_copy (x, d); } +#ifdef WOLFSSL_SMALL_STACK + XFREE(q, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } @@ -700,7 +752,6 @@ void fp_div_2(fp_int * a, fp_int * b) void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d) { int D; - fp_int t; /* if the shift count is <= 0 then we do no work */ if (b <= 0) { @@ -711,11 +762,9 @@ void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d) return; } - fp_init(&t); - - /* get the remainder */ - if (d != NULL) { - fp_mod_2d (a, b, &t); + /* get the remainder before a is changed in calculating c */ + if (a == c && d != NULL) { + fp_mod_2d (a, b, d); } /* copy */ @@ -731,28 +780,45 @@ void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d) if (D != 0) { fp_rshb(c, D); } - fp_clamp (c); - if (d != NULL) { - fp_copy (&t, d); + + /* get the remainder if a is not changed in calculating c */ + if (a != c && d != NULL) { + fp_mod_2d (a, b, d); } + + fp_clamp (c); } /* c = a mod b, 0 <= c < b */ int fp_mod(fp_int *a, fp_int *b, fp_int *c) { - fp_int t; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif int err; - fp_init(&t); - if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) { - return err; - } - if (t.sign != b->sign) { - fp_add(&t, b, c); - } else { - fp_copy(&t, c); +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + fp_init(t); + err = fp_div(a, b, NULL, t); + if (err == FP_OKAY) { + if (t->sign != b->sign) { + fp_add(t, b, c); + } else { + fp_copy(t, c); + } } - return FP_OKAY; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return err; } /* c = a mod 2**d */ @@ -785,113 +851,141 @@ void fp_mod_2d(fp_int *a, int b, fp_int *c) static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c) { - fp_int x, y, u, v, A, B, C, D; - int res; +#ifndef WOLFSSL_SMALL_STACK + fp_int x[1], 
y[1], u[1], v[1], A[1], B[1], C[1], D[1]; +#else + fp_int *x, *y, *u, *v, *A, *B, *C, *D; +#endif + int err; /* b cannot be negative */ if (b->sign == FP_NEG || fp_iszero(b) == FP_YES) { return FP_VAL; } +#ifdef WOLFSSL_SMALL_STACK + x = (fp_int*)XMALLOC(sizeof(fp_int) * 8, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (x == NULL) { + return FP_MEM; + } + y = &x[1]; u = &x[2]; v = &x[3]; A = &x[4]; B = &x[5]; C = &x[6]; D = &x[7]; +#endif + /* init temps */ - fp_init(&x); fp_init(&y); - fp_init(&u); fp_init(&v); - fp_init(&A); fp_init(&B); - fp_init(&C); fp_init(&D); + fp_init(x); fp_init(y); + fp_init(u); fp_init(v); + fp_init(A); fp_init(B); + fp_init(C); fp_init(D); /* x = a, y = b */ - if ((res = fp_mod(a, b, &x)) != FP_OKAY) { - return res; + if ((err = fp_mod(a, b, x)) != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; } - fp_copy(b, &y); + fp_copy(b, y); /* 2. [modified] if x,y are both even then return an error! */ - if (fp_iseven (&x) == FP_YES && fp_iseven (&y) == FP_YES) { + if (fp_iseven (x) == FP_YES && fp_iseven (y) == FP_YES) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return FP_VAL; } /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ - fp_copy (&x, &u); - fp_copy (&y, &v); - fp_set (&A, 1); - fp_set (&D, 1); + fp_copy (x, u); + fp_copy (y, v); + fp_set (A, 1); + fp_set (D, 1); top: /* 4. while u is even do */ - while (fp_iseven (&u) == FP_YES) { + while (fp_iseven (u) == FP_YES) { /* 4.1 u = u/2 */ - fp_div_2 (&u, &u); + fp_div_2 (u, u); /* 4.2 if A or B is odd then */ - if (fp_isodd (&A) == FP_YES || fp_isodd (&B) == FP_YES) { + if (fp_isodd (A) == FP_YES || fp_isodd (B) == FP_YES) { /* A = (A+y)/2, B = (B-x)/2 */ - fp_add (&A, &y, &A); - fp_sub (&B, &x, &B); + fp_add (A, y, A); + fp_sub (B, x, B); } /* A = A/2, B = B/2 */ - fp_div_2 (&A, &A); - fp_div_2 (&B, &B); + fp_div_2 (A, A); + fp_div_2 (B, B); } /* 5. while v is even do */ - while (fp_iseven (&v) == FP_YES) { + while (fp_iseven (v) == FP_YES) { /* 5.1 v = v/2 */ - fp_div_2 (&v, &v); + fp_div_2 (v, v); /* 5.2 if C or D is odd then */ - if (fp_isodd (&C) == FP_YES || fp_isodd (&D) == FP_YES) { + if (fp_isodd (C) == FP_YES || fp_isodd (D) == FP_YES) { /* C = (C+y)/2, D = (D-x)/2 */ - fp_add (&C, &y, &C); - fp_sub (&D, &x, &D); + fp_add (C, y, C); + fp_sub (D, x, D); } /* C = C/2, D = D/2 */ - fp_div_2 (&C, &C); - fp_div_2 (&D, &D); + fp_div_2 (C, C); + fp_div_2 (D, D); } /* 6. 
if u >= v then */ - if (fp_cmp (&u, &v) != FP_LT) { + if (fp_cmp (u, v) != FP_LT) { /* u = u - v, A = A - C, B = B - D */ - fp_sub (&u, &v, &u); - fp_sub (&A, &C, &A); - fp_sub (&B, &D, &B); + fp_sub (u, v, u); + fp_sub (A, C, A); + fp_sub (B, D, B); } else { /* v - v - u, C = C - A, D = D - B */ - fp_sub (&v, &u, &v); - fp_sub (&C, &A, &C); - fp_sub (&D, &B, &D); + fp_sub (v, u, v); + fp_sub (C, A, C); + fp_sub (D, B, D); } /* if not zero goto step 4 */ - if (fp_iszero (&u) == FP_NO) + if (fp_iszero (u) == FP_NO) goto top; /* now a = C, b = D, gcd == g*v */ /* if v != 1 then there is no inverse */ - if (fp_cmp_d (&v, 1) != FP_EQ) { + if (fp_cmp_d (v, 1) != FP_EQ) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return FP_VAL; } /* if its too low */ - while (fp_cmp_d(&C, 0) == FP_LT) { - fp_add(&C, b, &C); + while (fp_cmp_d(C, 0) == FP_LT) { + fp_add(C, b, C); } /* too big */ - while (fp_cmp_mag(&C, b) != FP_LT) { - fp_sub(&C, b, &C); + while (fp_cmp_mag(C, b) != FP_LT) { + fp_sub(C, b, C); } /* C is now the inverse */ - fp_copy(&C, c); + fp_copy(C, c); +#ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } /* c = 1/a (mod b) for odd b only */ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) { - fp_int x, y, u, v, B, D; +#ifndef WOLFSSL_SMALL_STACK + fp_int x[1], y[1], u[1], v[1], B[1], D[1]; +#else + fp_int *x, *y, *u, *v, *B, *D; +#endif int neg; /* 2. [modified] b must be odd */ @@ -899,84 +993,98 @@ int fp_invmod(fp_int *a, fp_int *b, fp_int *c) return fp_invmod_slow(a,b,c); } +#ifdef WOLFSSL_SMALL_STACK + x = (fp_int*)XMALLOC(sizeof(fp_int) * 6, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (x == NULL) { + return FP_MEM; + } + y = &x[1]; u = &x[2]; v = &x[3]; B = &x[4]; D = &x[5]; +#endif + /* init all our temps */ - fp_init(&x); fp_init(&y); - fp_init(&u); fp_init(&v); - fp_init(&B); fp_init(&D); + fp_init(x); fp_init(y); + fp_init(u); fp_init(v); + fp_init(B); fp_init(D); /* x == modulus, y == value to invert */ - fp_copy(b, &x); + fp_copy(b, x); /* we need y = |a| */ - fp_abs(a, &y); + fp_abs(a, y); /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */ - fp_copy(&x, &u); - fp_copy(&y, &v); - fp_set (&D, 1); + fp_copy(x, u); + fp_copy(y, v); + fp_set (D, 1); top: /* 4. while u is even do */ - while (fp_iseven (&u) == FP_YES) { + while (fp_iseven (u) == FP_YES) { /* 4.1 u = u/2 */ - fp_div_2 (&u, &u); + fp_div_2 (u, u); /* 4.2 if B is odd then */ - if (fp_isodd (&B) == FP_YES) { - fp_sub (&B, &x, &B); + if (fp_isodd (B) == FP_YES) { + fp_sub (B, x, B); } /* B = B/2 */ - fp_div_2 (&B, &B); + fp_div_2 (B, B); } /* 5. while v is even do */ - while (fp_iseven (&v) == FP_YES) { + while (fp_iseven (v) == FP_YES) { /* 5.1 v = v/2 */ - fp_div_2 (&v, &v); + fp_div_2 (v, v); /* 5.2 if D is odd then */ - if (fp_isodd (&D) == FP_YES) { + if (fp_isodd (D) == FP_YES) { /* D = (D-x)/2 */ - fp_sub (&D, &x, &D); + fp_sub (D, x, D); } /* D = D/2 */ - fp_div_2 (&D, &D); + fp_div_2 (D, D); } /* 6. 
if u >= v then */ - if (fp_cmp (&u, &v) != FP_LT) { + if (fp_cmp (u, v) != FP_LT) { /* u = u - v, B = B - D */ - fp_sub (&u, &v, &u); - fp_sub (&B, &D, &B); + fp_sub (u, v, u); + fp_sub (B, D, B); } else { /* v - v - u, D = D - B */ - fp_sub (&v, &u, &v); - fp_sub (&D, &B, &D); + fp_sub (v, u, v); + fp_sub (D, B, D); } /* if not zero goto step 4 */ - if (fp_iszero (&u) == FP_NO) { + if (fp_iszero (u) == FP_NO) { goto top; } /* now a = C, b = D, gcd == g*v */ /* if v != 1 then there is no inverse */ - if (fp_cmp_d (&v, 1) != FP_EQ) { + if (fp_cmp_d (v, 1) != FP_EQ) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return FP_VAL; } /* b is now the inverse */ neg = a->sign; - while (D.sign == FP_NEG) { - fp_add (&D, b, &D); + while (D->sign == FP_NEG) { + fp_add (D, b, D); } /* too big */ - while (fp_cmp_mag(&D, b) != FP_LT) { - fp_sub(&D, b, &D); + while (fp_cmp_mag(D, b) != FP_LT) { + fp_sub(D, b, D); } - fp_copy (&D, c); + fp_copy (D, c); c->sign = neg; +#ifdef WOLFSSL_SMALL_STACK + XFREE(x, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } @@ -984,20 +1092,35 @@ top: int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) { int err; - fp_int t; - - fp_init(&t); - fp_mul(a, b, &t); -#if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) - if (d->size < FP_SIZE) { - err = fp_mod(&t, c, &t); - fp_copy(&t, d); - } else +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; #endif - { - err = fp_mod(&t, c, d); + +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + fp_init(t); + err = fp_mul(a, b, t); + if (err == FP_OKAY) { + #if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) + if (d->size < FP_SIZE) { + err = fp_mod(t, c, t); + fp_copy(t, d); + } else + #endif + { + err = fp_mod(t, c, d); + } } +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return err; } @@ -1005,20 +1128,33 @@ int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) { int err; - fp_int t; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif - fp_init(&t); - fp_sub(a, b, &t); +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + fp_init(t); + fp_sub(a, b, t); #if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) if (d->size < FP_SIZE) { - err = fp_mod(&t, c, &t); - fp_copy(&t, d); + err = fp_mod(t, c, t); + fp_copy(t, d); } else #endif { - err = fp_mod(&t, c, d); + err = fp_mod(t, c, d); } +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return err; } @@ -1026,20 +1162,33 @@ int fp_submod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) { int err; - fp_int t; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif - fp_init(&t); - fp_add(a, b, &t); +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + fp_init(t); + fp_add(a, b, t); #if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) if (d->size < FP_SIZE) { - err = fp_mod(&t, c, &t); - fp_copy(&t, d); + err = fp_mod(t, c, t); + fp_copy(t, d); } else #endif { - err = fp_mod(&t, c, d); + err = fp_mod(t, c, d); } +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return err; } @@ -1051,10 +1200,14 @@ int 
fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d) */ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { +#ifndef WOLFSSL_SMALL_STACK #ifdef WC_NO_CACHE_RESISTANT fp_int R[2]; #else fp_int R[3]; /* need a temp for cache resistance */ +#endif +#else + fp_int *R; #endif fp_digit buf, mp; int err, bitcnt, digidx, y; @@ -1064,6 +1217,15 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) return err; } +#ifdef WOLFSSL_SMALL_STACK +#ifndef WC_NO_CACHE_RESISTANT + R = (fp_int*)XMALLOC(sizeof(fp_int) * 3, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#else + R = (fp_int*)XMALLOC(sizeof(fp_int) * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + if (R == NULL) + return FP_MEM; +#endif fp_init(&R[0]); fp_init(&R[1]); #ifndef WC_NO_CACHE_RESISTANT @@ -1108,10 +1270,36 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) buf <<= (fp_digit)1; /* do ops */ - fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp); + err = fp_mul(&R[0], &R[1], &R[y^1]); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(&R[y^1], P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } #ifdef WC_NO_CACHE_RESISTANT - fp_sqr(&R[y], &R[y]); fp_montgomery_reduce(&R[y], P, mp); + err = fp_sqr(&R[y], &R[y]); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(&R[y], P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } #else /* instead of using R[y] for sqr, which leaks key bit to cache monitor, * use R[2] as temp, make sure address calc is constant, keep @@ -1119,16 +1307,32 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) fp_copy((fp_int*) ( ((wolfssl_word)&R[0] & wc_off_on_addr[y^1]) + ((wolfssl_word)&R[1] & wc_off_on_addr[y]) ), &R[2]); - fp_sqr(&R[2], &R[2]); fp_montgomery_reduce(&R[2], P, mp); + err = fp_sqr(&R[2], &R[2]); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(&R[2], P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } fp_copy(&R[2], (fp_int*) ( ((wolfssl_word)&R[0] & wc_off_on_addr[y^1]) + ((wolfssl_word)&R[1] & wc_off_on_addr[y]) ) ); #endif /* WC_NO_CACHE_RESISTANT */ } - fp_montgomery_reduce(&R[0], P, mp); + err = fp_montgomery_reduce(&R[0], P, mp); fp_copy(&R[0], Y); - return FP_OKAY; +#ifdef WOLFSSL_SMALL_STACK + XFREE(R, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return err; } #else /* TFM_TIMING_RESISTANT */ @@ -1138,12 +1342,13 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) */ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) { - fp_int res; fp_digit buf, mp; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; #ifdef WOLFSSL_SMALL_STACK + fp_int *res; fp_int *M; #else + fp_int res[1]; fp_int M[64]; #endif @@ -1167,11 +1372,12 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) } #ifdef WOLFSSL_SMALL_STACK - /* only allocate space for what's needed */ - M = (fp_int*)XMALLOC(sizeof(fp_int)*(1 << winsize), NULL, DYNAMIC_TYPE_TMP_BUFFER); + /* only allocate space for what's needed for window plus res */ + M = 
(fp_int*)XMALLOC(sizeof(fp_int)*((1 << winsize) + 1), NULL, DYNAMIC_TYPE_TMP_BUFFER); if (M == NULL) { return FP_MEM; } + res = &M[1 << winsize]; #endif /* init M array */ @@ -1179,7 +1385,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) fp_init(&M[x]); /* setup result */ - fp_init(&res); + fp_init(res); /* create M table * @@ -1189,7 +1395,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) */ /* now we need R mod m */ - fp_montgomery_calc_normalization (&res, P); + fp_montgomery_calc_normalization (res, P); /* now set M[1] to G * R mod m */ if (fp_cmp_mag(P, G) != FP_GT) { @@ -1198,20 +1404,38 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) } else { fp_copy(G, &M[1]); } - fp_mulmod (&M[1], &res, P, &M[1]); + fp_mulmod (&M[1], res, P, &M[1]); /* compute the value at M[1<<(winsize-1)] by * squaring M[1] (winsize-1) times */ fp_copy (&M[1], &M[1 << (winsize - 1)]); for (x = 0; x < (winsize - 1); x++) { fp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)]); - fp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp); + err = fp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } } /* create upper table */ for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) { - fp_mul(&M[x - 1], &M[1], &M[x]); - fp_montgomery_reduce(&M[x], P, mp); + err = fp_mul(&M[x - 1], &M[1], &M[x]); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(&M[x], P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } } /* set initial mode and bit cnt */ @@ -1249,8 +1473,20 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) /* if the bit is zero and mode == 1 then we square */ if (mode == 1 && y == 0) { - fp_sqr(&res, &res); - fp_montgomery_reduce(&res, P, mp); + err = fp_sqr(res, res); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + fp_montgomery_reduce(res, P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } continue; } @@ -1262,13 +1498,37 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) /* ok window is filled so square as required and multiply */ /* square first */ for (x = 0; x < winsize; x++) { - fp_sqr(&res, &res); - fp_montgomery_reduce(&res, P, mp); + err = fp_sqr(res, res); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(res, P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } } /* then multiply */ - fp_mul(&res, &M[bitbuf], &res); - fp_montgomery_reduce(&res, P, mp); + err = fp_mul(res, &M[bitbuf], res); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(res, P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } /* empty window and reset */ bitcpy = 0; @@ -1281,15 +1541,39 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) if (mode == 2 && bitcpy > 0) { /* square then 
multiply if the bit is set */ for (x = 0; x < bitcpy; x++) { - fp_sqr(&res, &res); - fp_montgomery_reduce(&res, P, mp); + err = fp_sqr(res, res); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(res, P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } /* get next bit of the window */ bitbuf <<= 1; if ((bitbuf & (1 << winsize)) != 0) { /* then multiply */ - fp_mul(&res, &M[1], &res); - fp_montgomery_reduce(&res, P, mp); + err = fp_mul(res, &M[1], res); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } + err = fp_montgomery_reduce(res, P, mp); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } } } } @@ -1300,16 +1584,15 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) * to reduce one more time to cancel out the factor * of R. */ - fp_montgomery_reduce(&res, P, mp); + err = fp_montgomery_reduce(res, P, mp); /* swap res with Y */ - fp_copy (&res, Y); + fp_copy (res, Y); #ifdef WOLFSSL_SMALL_STACK XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER); #endif - - return FP_OKAY; + return err; } #endif /* TFM_TIMING_RESISTANT */ @@ -1324,18 +1607,31 @@ int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y) if (X->sign == FP_NEG) { #ifndef POSITIVE_EXP_ONLY /* reduce stack if assume no negatives */ int err; - fp_int tmp; + #ifndef WOLFSSL_SMALL_STACK + fp_int tmp[1]; + #else + fp_int *tmp; + #endif + + #ifdef WOLFSSL_SMALL_STACK + tmp = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) + return FP_MEM; + #endif /* yes, copy G and invmod it */ - fp_init_copy(&tmp, G); - if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) { - return err; - } - X->sign = FP_ZPOS; - err = _fp_exptmod(&tmp, X, P, Y); - if (X != Y) { - X->sign = FP_NEG; + fp_init_copy(tmp, G); + err = fp_invmod(tmp, P, tmp); + if (err == FP_OKAY) { + X->sign = FP_ZPOS; + err = _fp_exptmod(tmp, X, P, Y); + if (X != Y) { + X->sign = FP_NEG; + } } + #ifdef WOLFSSL_SMALL_STACK + XFREE(tmp, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return err; #else return FP_VAL; @@ -1372,8 +1668,9 @@ void fp_2expt(fp_int *a, int b) } /* b = a*a */ -void fp_sqr(fp_int *A, fp_int *B) +int fp_sqr(fp_int *A, fp_int *B) { + int err; int y, oldused; oldused = B->used; @@ -1381,118 +1678,131 @@ void fp_sqr(fp_int *A, fp_int *B) /* call generic if we're out of range */ if (y + y > FP_SIZE) { - fp_sqr_comba(A, B); + err = fp_sqr_comba(A, B); goto clean; } #if defined(TFM_SQR3) && FP_SIZE >= 6 if (y <= 3) { - fp_sqr_comba3(A,B); + err = fp_sqr_comba3(A,B); goto clean; } #endif #if defined(TFM_SQR4) && FP_SIZE >= 8 if (y == 4) { - fp_sqr_comba4(A,B); + err = fp_sqr_comba4(A,B); goto clean; } #endif #if defined(TFM_SQR6) && FP_SIZE >= 12 if (y <= 6) { - fp_sqr_comba6(A,B); + err = fp_sqr_comba6(A,B); goto clean; } #endif #if defined(TFM_SQR7) && FP_SIZE >= 14 if (y == 7) { - fp_sqr_comba7(A,B); + err = fp_sqr_comba7(A,B); goto clean; } #endif #if defined(TFM_SQR8) && FP_SIZE >= 16 if (y == 8) { - fp_sqr_comba8(A,B); + err = fp_sqr_comba8(A,B); goto clean; } #endif #if defined(TFM_SQR9) && FP_SIZE >= 18 if (y == 9) { - fp_sqr_comba9(A,B); + err = fp_sqr_comba9(A,B); goto clean; } #endif #if defined(TFM_SQR12) && FP_SIZE >= 24 if (y <= 12) { - fp_sqr_comba12(A,B); + err = fp_sqr_comba12(A,B); goto 
clean; } #endif #if defined(TFM_SQR17) && FP_SIZE >= 34 if (y <= 17) { - fp_sqr_comba17(A,B); + err = fp_sqr_comba17(A,B); goto clean; } #endif #if defined(TFM_SMALL_SET) if (y <= 16) { - fp_sqr_comba_small(A,B); + err = fp_sqr_comba_small(A,B); goto clean; } #endif #if defined(TFM_SQR20) && FP_SIZE >= 40 if (y <= 20) { - fp_sqr_comba20(A,B); + err = fp_sqr_comba20(A,B); goto clean; } #endif #if defined(TFM_SQR24) && FP_SIZE >= 48 if (y <= 24) { - fp_sqr_comba24(A,B); + err = fp_sqr_comba24(A,B); goto clean; } #endif #if defined(TFM_SQR28) && FP_SIZE >= 56 if (y <= 28) { - fp_sqr_comba28(A,B); + err = fp_sqr_comba28(A,B); goto clean; } #endif #if defined(TFM_SQR32) && FP_SIZE >= 64 if (y <= 32) { - fp_sqr_comba32(A,B); + err = fp_sqr_comba32(A,B); goto clean; } #endif #if defined(TFM_SQR48) && FP_SIZE >= 96 if (y <= 48) { - fp_sqr_comba48(A,B); + err = fp_sqr_comba48(A,B); goto clean; } #endif #if defined(TFM_SQR64) && FP_SIZE >= 128 if (y <= 64) { - fp_sqr_comba64(A,B); + err = fp_sqr_comba64(A,B); goto clean; } #endif - fp_sqr_comba(A, B); + err = fp_sqr_comba(A, B); clean: /* zero any excess digits on the destination that we didn't write to */ for (y = B->used; y >= 0 && y < oldused; y++) { B->dp[y] = 0; } + + return err; } /* generic comba squarer */ -void fp_sqr_comba(fp_int *A, fp_int *B) +int fp_sqr_comba(fp_int *A, fp_int *B) { int pa, ix, iz; fp_digit c0, c1, c2; - fp_int tmp, *dst; #ifdef TFM_ISO fp_word tt; #endif + fp_int *dst; +#ifndef WOLFSSL_SMALL_STACK + fp_int tmp[1]; +#else + fp_int *tmp; +#endif + +#ifdef WOLFSSL_SMALL_STACK + tmp = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) + return FP_MEM; +#endif /* get size of output and trim */ pa = A->used + A->used; @@ -1505,8 +1815,8 @@ void fp_sqr_comba(fp_int *A, fp_int *B) COMBA_CLEAR; if (A == B) { - fp_init(&tmp); - dst = &tmp; + fp_init(tmp); + dst = tmp; } else { fp_zero(B); dst = B; @@ -1562,6 +1872,11 @@ void fp_sqr_comba(fp_int *A, fp_int *B) if (dst != B) { fp_copy(dst, B); } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(tmp, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } int fp_cmp(fp_int *a, fp_int *b) @@ -1704,27 +2019,38 @@ static WC_INLINE void innermul8_mulx(fp_digit *c_mulx, fp_digit *cy_mulx, fp_dig } /* computes x/R == x (mod N) via Montgomery Reduction */ -static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp) +static int fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp) { - fp_digit c[FP_SIZE+1], *_c, *tmpm, mu = 0; +#ifndef WOLFSSL_SMALL_STACK + fp_digit c[FP_SIZE+1]; +#else + fp_digit *c; +#endif + fp_digit *_c, *tmpm, mu = 0; int oldused, x, y, pa; /* bail if too large */ if (m->used > (FP_SIZE/2)) { (void)mu; /* shut up compiler */ - return; + return FP_OKAY; } #ifdef TFM_SMALL_MONT_SET if (m->used <= 16) { - fp_montgomery_reduce_small(a, m, mp); - return; + return fp_montgomery_reduce_small(a, m, mp); } #endif +#ifdef WOLFSSL_SMALL_STACK + /* only allocate space for what's needed for window plus res */ + c = (fp_digit*)XMALLOC(sizeof(fp_digit)*(FP_SIZE + 1), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (c == NULL) { + return FP_MEM; + } +#endif /* now zero the buff */ - XMEMSET(c, 0, sizeof(c)); + XMEMSET(c, 0, sizeof(fp_digit)*(FP_SIZE + 1)); pa = m->used; /* copy the input */ @@ -1778,33 +2104,50 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp) if (fp_cmp_mag (a, m) != FP_LT) { s_fp_sub (a, m, a); } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(c, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif /* 
computes x/R == x (mod N) via Montgomery Reduction */ -void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) +int fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) { - fp_digit c[FP_SIZE+1], *_c, *tmpm, mu = 0; - int oldused, x, y, pa; +#ifndef WOLFSSL_SMALL_STACK + fp_digit c[FP_SIZE+1]; +#else + fp_digit *c; +#endif + fp_digit *_c, *tmpm, mu = 0; + int oldused, x, y, pa, err; - IF_HAVE_INTEL_MULX(fp_montgomery_reduce_mulx(a, m, mp), return) ; + IF_HAVE_INTEL_MULX(err = fp_montgomery_reduce_mulx(a, m, mp), return err) ; + (void)err; /* bail if too large */ if (m->used > (FP_SIZE/2)) { (void)mu; /* shut up compiler */ - return; + return FP_OKAY; } #ifdef TFM_SMALL_MONT_SET if (m->used <= 16) { - fp_montgomery_reduce_small(a, m, mp); - return; + return fp_montgomery_reduce_small(a, m, mp); } #endif +#ifdef WOLFSSL_SMALL_STACK + /* only allocate space for what's needed for window plus res */ + c = (fp_digit*)XMALLOC(sizeof(fp_digit)*(FP_SIZE + 1), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (c == NULL) { + return FP_MEM; + } +#endif /* now zero the buff */ - XMEMSET(c, 0, sizeof(c)); + XMEMSET(c, 0, sizeof(fp_digit)*(FP_SIZE + 1)); pa = m->used; /* copy the input */ @@ -1860,6 +2203,11 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) if (fp_cmp_mag (a, m) != FP_LT) { s_fp_sub (a, m, a); } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(c, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } void fp_read_unsigned_bin(fp_int *a, const unsigned char *b, int c) @@ -1953,15 +2301,30 @@ int fp_to_unsigned_bin_at_pos(int x, fp_int *t, unsigned char *b) #endif } -void fp_to_unsigned_bin(fp_int *a, unsigned char *b) +int fp_to_unsigned_bin(fp_int *a, unsigned char *b) { int x; - fp_int t; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif - fp_init_copy(&t, a); +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif - x = fp_to_unsigned_bin_at_pos(0, &t, b); + fp_init_copy(t, a); + + x = fp_to_unsigned_bin_at_pos(0, t, b); fp_reverse (b, x); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } int fp_unsigned_bin_size(fp_int *a) @@ -2194,20 +2557,36 @@ void fp_reverse (unsigned char *s, int len) /* c = a - b */ -void fp_sub_d(fp_int *a, fp_digit b, fp_int *c) +int fp_sub_d(fp_int *a, fp_digit b, fp_int *c) { - fp_int tmp; - fp_init(&tmp); - fp_set(&tmp, b); +#ifndef WOLFSSL_SMALL_STACK + fp_int tmp[1]; +#else + fp_int *tmp; +#endif + +#ifdef WOLFSSL_SMALL_STACK + tmp = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) + return FP_MEM; +#endif + + fp_init(tmp); + fp_set(tmp, b); #if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) if (c->size < FP_SIZE) { - fp_sub(a, &tmp, &tmp); - fp_copy(&tmp, c); + fp_sub(a, tmp, tmp); + fp_copy(tmp, c); } else #endif { - fp_sub(a, &tmp, c); + fp_sub(a, tmp, c); } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(tmp, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } @@ -2339,8 +2718,7 @@ int wolfcrypt_mp_mul(mp_int * a, mp_int * b, mp_int * c) int mp_mul (mp_int * a, mp_int * b, mp_int * c) #endif { - fp_mul(a, b, c); - return MP_OKAY; + return fp_mul(a, b, c); } int mp_mul_d (mp_int * a, mp_digit b, mp_int * c) @@ -2431,8 +2809,7 @@ int mp_to_unsigned_bin_at_pos(int x, fp_int *t, unsigned char *b) /* store in unsigned [big endian] format */ int mp_to_unsigned_bin (mp_int * a, unsigned char *b) { - fp_to_unsigned_bin(a,b); - return MP_OKAY; + return 
fp_to_unsigned_bin(a,b); } /* reads a unsigned char array, assumes the msb is stored first [big endian] */ @@ -2445,8 +2822,7 @@ int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c) int mp_sub_d(fp_int *a, fp_digit b, fp_int *c) { - fp_sub_d(a, b, c); - return MP_OKAY; + return fp_sub_d(a, b, c); } int mp_mul_2d(fp_int *a, int b, fp_int *c) @@ -2571,22 +2947,36 @@ int mp_set_bit(mp_int *a, mp_digit b) int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c) { int err; - fp_int t; - - fp_init(&t); - fp_sqr(a, &t); - -#if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) - if (c->size < FP_SIZE) { - err = fp_mod(&t, b, &t); - fp_copy(&t, c); - } - else +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; #endif - { - err = fp_mod(&t, b, c); + +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + fp_init(t); + err = fp_sqr(a, t); + if (err == FP_OKAY) { + #if defined(ALT_ECC_SIZE) || defined(HAVE_WOLF_BIGINT) + if (c->size < FP_SIZE) { + err = fp_mod(t, b, t); + fp_copy(t, c); + } + else + #endif + { + err = fp_mod(t, b, c); + } } +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return err; } @@ -2613,13 +3003,28 @@ int mp_montgomery_calc_normalization(mp_int *a, mp_int *b) /* swap the elements of two integers, for cases where you can't simply swap the * mp_int pointers around */ -static void fp_exch (fp_int * a, fp_int * b) +static int fp_exch (fp_int * a, fp_int * b) { - fp_int t; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif - t = *a; +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + + *t = *a; *a = *b; - *b = t; + *b = *t; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif @@ -2676,13 +3081,15 @@ static int s_is_power_of_two(fp_digit b, int *p) /* a/b => cb + d == a */ static int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) { - fp_int q; +#ifndef WOLFSSL_SMALL_STACK + fp_int q[1]; +#else + fp_int *q; +#endif fp_word w; fp_digit t; int ix; - fp_init(&q); - /* cannot divide by zero */ if (b == 0) { return FP_VAL; @@ -2710,9 +3117,17 @@ static int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) return FP_OKAY; } +#ifdef WOLFSSL_SMALL_STACK + q = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (q == NULL) + return FP_MEM; +#endif + + fp_init(q); + if (c != NULL) { - q.used = a->used; - q.sign = a->sign; + q->used = a->used; + q->sign = a->sign; } w = 0; @@ -2726,7 +3141,7 @@ static int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) t = 0; } if (c != NULL) - q.dp[ix] = (fp_digit)t; + q->dp[ix] = (fp_digit)t; } if (d != NULL) { @@ -2734,10 +3149,13 @@ static int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d) } if (c != NULL) { - fp_clamp(&q); - fp_copy(&q, c); + fp_clamp(q); + fp_copy(q, c); } +#ifdef WOLFSSL_SMALL_STACK + XFREE(q, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } @@ -2759,15 +3177,12 @@ int mp_mod_d(fp_int *a, fp_digit b, fp_digit *c) #if !defined(NO_DH) || !defined(NO_DSA) || !defined(NO_RSA) || \ defined(WOLFSSL_KEY_GEN) -static int fp_isprime_ex(fp_int *a, int t); -/* static int fp_isprime(fp_int *a); */ +static int fp_isprime_ex(fp_int *a, int t, int* result); int mp_prime_is_prime(mp_int* a, int t, int* result) { - (void)t; - *result = fp_isprime_ex(a, t); - return MP_OKAY; + return 
fp_isprime_ex(a, t, result); } @@ -2778,60 +3193,90 @@ int mp_prime_is_prime(mp_int* a, int t, int* result) * Randomly the chance of error is no more than 1/4 and often * very much lower. */ -static void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result) +static int fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result) { - fp_int n1, y, r; +#ifndef WOLFSSL_SMALL_STACK + fp_int n1[1], y[1], r[1]; +#else + fp_int *n1, *y, *r; +#endif int s, j; + int err; /* default */ *result = FP_NO; /* ensure b > 1 */ if (fp_cmp_d(b, 1) != FP_GT) { - return; + return FP_OKAY; } +#ifdef WOLFSSL_SMALL_STACK + n1 = (fp_int*)XMALLOC(sizeof(fp_int) * 3, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (n1 == NULL) { + return FP_MEM; + } + y = &n1[1]; r = &n1[2]; +#endif + /* get n1 = a - 1 */ - fp_init_copy(&n1, a); - fp_sub_d(&n1, 1, &n1); + fp_init_copy(n1, a); + err = fp_sub_d(n1, 1, n1); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(n1, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } /* set 2**s * r = n1 */ - fp_init_copy(&r, &n1); + fp_init_copy(r, n1); /* count the number of least significant bits * which are zero */ - s = fp_cnt_lsb(&r); + s = fp_cnt_lsb(r); /* now divide n - 1 by 2**s */ - fp_div_2d (&r, s, &r, NULL); + fp_div_2d (r, s, r, NULL); /* compute y = b**r mod a */ - fp_init(&y); - fp_exptmod(b, &r, a, &y); + fp_init(y); + fp_exptmod(b, r, a, y); /* if y != 1 and y != n1 do */ - if (fp_cmp_d (&y, 1) != FP_EQ && fp_cmp (&y, &n1) != FP_EQ) { + if (fp_cmp_d (y, 1) != FP_EQ && fp_cmp (y, n1) != FP_EQ) { j = 1; /* while j <= s-1 and y != n1 */ - while ((j <= (s - 1)) && fp_cmp (&y, &n1) != FP_EQ) { - fp_sqrmod (&y, a, &y); + while ((j <= (s - 1)) && fp_cmp (y, n1) != FP_EQ) { + fp_sqrmod (y, a, y); /* if y == 1 then composite */ - if (fp_cmp_d (&y, 1) == FP_EQ) { - return; + if (fp_cmp_d (y, 1) == FP_EQ) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(n1, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return FP_OKAY; } ++j; } /* if y != n1 then composite */ - if (fp_cmp (&y, &n1) != FP_EQ) { - return; + if (fp_cmp (y, n1) != FP_EQ) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(n1, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return FP_OKAY; } } /* probably prime now */ *result = FP_YES; + +#ifdef WOLFSSL_SMALL_STACK + XFREE(n1, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } @@ -2874,9 +3319,13 @@ static const fp_digit primes[FP_PRIME_SIZE] = { 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653 }; -int fp_isprime_ex(fp_int *a, int t) +int fp_isprime_ex(fp_int *a, int t, int* result) { - fp_int b; +#ifndef WOLFSSL_SMALL_STACK + fp_int b[1]; +#else + fp_int *b; +#endif fp_digit d; int r, res; @@ -2886,7 +3335,8 @@ int fp_isprime_ex(fp_int *a, int t) for (r = 0; r < FP_PRIME_SIZE; r++) { if (fp_cmp_d(a, primes[r]) == FP_EQ) { - return FP_YES; + *result = FP_YES; + return FP_OKAY; } } @@ -2894,29 +3344,36 @@ int fp_isprime_ex(fp_int *a, int t) for (r = 0; r < FP_PRIME_SIZE; r++) { res = fp_mod_d(a, primes[r], &d); if (res != MP_OKAY || d == 0) { - return FP_NO; + *result = FP_NO; + return FP_OKAY; } } - /* now do 't' miller rabins */ - fp_init(&b); - for (r = 0; r < t; r++) { - fp_set(&b, primes[r]); - fp_prime_miller_rabin(a, &b, &res); - if (res == FP_NO) { - return FP_NO; - } - } - return FP_YES; -} - -#if 0 -/* Removed in favor of fp_isprime_ex(). 
*/ -int fp_isprime(fp_int *a) -{ - return fp_isprime_ex(a, 8); -} +#ifdef WOLFSSL_SMALL_STACK + b = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) + return FP_MEM; #endif + /* now do 't' miller rabins */ + fp_init(b); + for (r = 0; r < t; r++) { + fp_set(b, primes[r]); + fp_prime_miller_rabin(a, b, &res); + if (res == FP_NO) { + *result = FP_NO; + #ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return FP_OKAY; + } + } + *result = FP_YES; +#ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; +} + int mp_prime_is_prime_ex(mp_int* a, int t, int* result, WC_RNG* rng) { @@ -2951,47 +3408,63 @@ int mp_prime_is_prime_ex(mp_int* a, int t, int* result, WC_RNG* rng) /* now do a miller rabin with up to t random numbers, this should * give a (1/4)^t chance of a false prime. */ if (ret == FP_YES) { - fp_int b, c; + #ifndef WOLFSSL_SMALL_STACK + fp_int b[1], c[1]; + byte base[FP_MAX_PRIME_SIZE]; + #else + fp_int *b, *c; + byte* base; + #endif word32 baseSz; - #ifndef WOLFSSL_SMALL_STACK - byte base[FP_MAX_PRIME_SIZE]; - #else - byte* base; - #endif + int err; baseSz = fp_count_bits(a); /* The base size is the number of bits / 8. One is added if the number * of bits isn't an even 8. */ baseSz = (baseSz / 8) + ((baseSz % 8) ? 1 : 0); - #ifndef WOLFSSL_SMALL_STACK - if (baseSz > sizeof(base)) - return FP_MEM; - #else - base = (byte*)XMALLOC(baseSz, NULL, DYNAMIC_TYPE_TMP_BUFFER); - if (base == NULL) - return FP_MEM; - #endif + #ifndef WOLFSSL_SMALL_STACK + if (baseSz > sizeof(base)) + return FP_MEM; + #else + base = (byte*)XMALLOC(baseSz, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (base == NULL) + return FP_MEM; - fp_init(&b); - fp_init(&c); - fp_sub_d(a, 2, &c); + b = (fp_int*)XMALLOC(sizeof(fp_int) * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (b == NULL) { + return FP_MEM; + } + c = &b[1]; + #endif + + fp_init(b); + fp_init(c); + err = fp_sub_d(a, 2, c); + if (err != FP_OKAY) { + #ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(base, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + return err; + } while (t > 0) { wc_RNG_GenerateBlock(rng, base, baseSz); - fp_read_unsigned_bin(&b, base, baseSz); - if (fp_cmp_d(&b, 2) != FP_GT || fp_cmp(&b, &c) != FP_LT) + fp_read_unsigned_bin(b, base, baseSz); + if (fp_cmp_d(b, 2) != FP_GT || fp_cmp(b, c) != FP_LT) continue; - fp_prime_miller_rabin(a, &b, &ret); + fp_prime_miller_rabin(a, b, &ret); if (ret == FP_NO) break; - fp_zero(&b); + fp_zero(b); t--; } - fp_clear(&b); - fp_clear(&c); - #ifdef WOLFSSL_SMALL_STACK - XFREE(base, NULL, DYNAMIC_TYPE_TMP_BUFFER); - #endif + fp_clear(b); + fp_clear(c); + #ifdef WOLFSSL_SMALL_STACK + XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(base, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif } #else (void)t; @@ -3005,21 +3478,19 @@ int mp_prime_is_prime_ex(mp_int* a, int t, int* result, WC_RNG* rng) #ifdef WOLFSSL_KEY_GEN -static void fp_gcd(fp_int *a, fp_int *b, fp_int *c); -static void fp_lcm(fp_int *a, fp_int *b, fp_int *c); +static int fp_gcd(fp_int *a, fp_int *b, fp_int *c); +static int fp_lcm(fp_int *a, fp_int *b, fp_int *c); static int fp_randprime(fp_int* N, int len, WC_RNG* rng, void* heap); int mp_gcd(fp_int *a, fp_int *b, fp_int *c) { - fp_gcd(a, b, c); - return MP_OKAY; + return fp_gcd(a, b, c); } int mp_lcm(fp_int *a, fp_int *b, fp_int *c) { - fp_lcm(a, b, c); - return MP_OKAY; + return fp_lcm(a, b, c); } int mp_rand_prime(mp_int* N, int len, WC_RNG* rng, void* heap) @@ -3041,8 +3512,7 @@ int 
mp_rand_prime(mp_int* N, int len, WC_RNG* rng, void* heap) int mp_exch (mp_int * a, mp_int * b) { - fp_exch(a, b); - return MP_OKAY; + return fp_exch(a, b); } @@ -3110,37 +3580,62 @@ int fp_randprime(fp_int* N, int len, WC_RNG* rng, void* heap) } /* c = [a, b] */ -void fp_lcm(fp_int *a, fp_int *b, fp_int *c) +int fp_lcm(fp_int *a, fp_int *b, fp_int *c) { - fp_int t1, t2; + int err; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[2]; +#else + fp_int *t; +#endif - fp_init(&t1); - fp_init(&t2); - fp_gcd(a, b, &t1); - if (fp_cmp_mag(a, b) == FP_GT) { - fp_div(a, &t1, &t2, NULL); - fp_mul(b, &t2, c); - } else { - fp_div(b, &t1, &t2, NULL); - fp_mul(a, &t2, c); +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int) * 2, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) { + return FP_MEM; } +#endif + + fp_init(&t[0]); + fp_init(&t[1]); + err = fp_gcd(a, b, &t[0]); + if (err == FP_OKAY) { + if (fp_cmp_mag(a, b) == FP_GT) { + err = fp_div(a, &t[0], &t[1], NULL); + if (err == FP_OKAY) + err = fp_mul(b, &t[1], c); + } else { + err = fp_div(b, &t[0], &t[1], NULL); + if (err == FP_OKAY) + err = fp_mul(a, &t[1], c); + } + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return err; } /* c = (a, b) */ -void fp_gcd(fp_int *a, fp_int *b, fp_int *c) +int fp_gcd(fp_int *a, fp_int *b, fp_int *c) { - fp_int u, v, r; +#ifndef WOLFSSL_SMALL_STACK + fp_int u[1], v[1], r[1]; +#else + fp_int *u, *v, *r; +#endif /* either zero than gcd is the largest */ if (fp_iszero (a) == FP_YES && fp_iszero (b) == FP_NO) { fp_abs (b, c); - return; + return FP_OKAY; } if (fp_iszero (a) == FP_NO && fp_iszero (b) == FP_YES) { fp_abs (a, c); - return; + return FP_OKAY; } /* optimized. At this point if a == 0 then @@ -3148,25 +3643,38 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c) */ if (fp_iszero (a) == FP_YES) { fp_zero(c); - return; + return FP_OKAY; } +#ifdef WOLFSSL_SMALL_STACK + u = (fp_int*)XMALLOC(sizeof(fp_int) * 3, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (u == NULL) { + return FP_MEM; + } + v = &u[1]; r = &u[2]; +#endif + /* sort inputs */ if (fp_cmp_mag(a, b) != FP_LT) { - fp_init_copy(&u, a); - fp_init_copy(&v, b); + fp_init_copy(u, a); + fp_init_copy(v, b); } else { - fp_init_copy(&u, b); - fp_init_copy(&v, a); + fp_init_copy(u, b); + fp_init_copy(v, a); } - fp_init(&r); - while (fp_iszero(&v) == FP_NO) { - fp_mod(&u, &v, &r); - fp_copy(&v, &u); - fp_copy(&r, &v); + fp_init(r); + while (fp_iszero(v) == FP_NO) { + fp_mod(u, v, r); + fp_copy(v, u); + fp_copy(r, v); } - fp_copy(&u, c); + fp_copy(u, c); + +#ifdef WOLFSSL_SMALL_STACK + XFREE(u, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif + return FP_OKAY; } #endif /* WOLFSSL_KEY_GEN */ @@ -3177,10 +3685,26 @@ void fp_gcd(fp_int *a, fp_int *b, fp_int *c) /* c = a + b */ void fp_add_d(fp_int *a, fp_digit b, fp_int *c) { +#ifndef WOLFSSL_SMALL_STACK fp_int tmp; fp_init(&tmp); fp_set(&tmp, b); fp_add(a, &tmp, c); +#else + int i; + fp_word t = b; + + fp_copy(a, c); + for (i = 0; t != 0 && i < FP_SIZE && i < c->used; i++) { + t += c->dp[i]; + c->dp[i] = (fp_digit)t; + t >>= DIGIT_BIT; + } + if (i == c->used && i < FP_SIZE && t != 0) { + c->dp[i] = t; + c->used++; + } +#endif } /* external compatibility */ @@ -3322,15 +3846,13 @@ int mp_read_radix(mp_int *a, const char *str, int radix) /* fast math conversion */ int mp_sqr(fp_int *A, fp_int *B) { - fp_sqr(A, B); - return MP_OKAY; + return fp_sqr(A, B); } /* fast math conversion */ int mp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp) { - fp_montgomery_reduce(a, m, mp); - return MP_OKAY; + return 
fp_montgomery_reduce(a, m, mp); } @@ -3378,9 +3900,13 @@ int mp_set(fp_int *a, fp_digit b) /* returns size of ASCII representation */ int mp_radix_size (mp_int *a, int radix, int *size) { - int res, digs; - fp_int t; + int res, digs; fp_digit d; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif *size = 0; @@ -3408,34 +3934,50 @@ int mp_radix_size (mp_int *a, int radix, int *size) ++digs; } +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + /* init a copy of the input */ - fp_init_copy (&t, a); + fp_init_copy (t, a); /* force temp to positive */ - t.sign = FP_ZPOS; + t->sign = FP_ZPOS; /* fetch out all of the digits */ - while (fp_iszero (&t) == FP_NO) { - if ((res = fp_div_d (&t, (mp_digit) radix, &t, &d)) != FP_OKAY) { - fp_zero (&t); + while (fp_iszero (t) == FP_NO) { + if ((res = fp_div_d (t, (mp_digit) radix, t, &d)) != FP_OKAY) { + fp_zero (t); + #ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return res; } ++digs; } - fp_zero (&t); + fp_zero (t); /* return digs + 1, the 1 is for the NULL byte that would be required. */ *size = digs + 1; +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } /* stores a bignum as a ASCII string in a given radix (2..64) */ int mp_toradix (mp_int *a, char *str, int radix) { - int res, digs; - fp_int t; + int res, digs; fp_digit d; - char *_s = str; + char *_s = str; +#ifndef WOLFSSL_SMALL_STACK + fp_int t[1]; +#else + fp_int *t; +#endif /* check range of the radix */ if (radix < 2 || radix > 64) { @@ -3449,20 +3991,29 @@ int mp_toradix (mp_int *a, char *str, int radix) return FP_YES; } +#ifdef WOLFSSL_SMALL_STACK + t = (fp_int*)XMALLOC(sizeof(fp_int), NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (t == NULL) + return FP_MEM; +#endif + /* init a copy of the input */ - fp_init_copy (&t, a); + fp_init_copy (t, a); /* if it is negative output a - */ - if (t.sign == FP_NEG) { + if (t->sign == FP_NEG) { ++_s; *str++ = '-'; - t.sign = FP_ZPOS; + t->sign = FP_ZPOS; } digs = 0; - while (fp_iszero (&t) == FP_NO) { - if ((res = fp_div_d (&t, (fp_digit) radix, &t, &d)) != FP_OKAY) { - fp_zero (&t); + while (fp_iszero (t) == FP_NO) { + if ((res = fp_div_d (t, (fp_digit) radix, t, &d)) != FP_OKAY) { + fp_zero (t); + #ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif return res; } *str++ = fp_s_rmap[d]; @@ -3477,7 +4028,10 @@ int mp_toradix (mp_int *a, char *str, int radix) /* append a NULL so the string is properly terminated */ *str = '\0'; - fp_zero (&t); + fp_zero (t); +#ifdef WOLFSSL_SMALL_STACK + XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif return FP_OKAY; } diff --git a/wolfssl/wolfcrypt/tfm.h b/wolfssl/wolfcrypt/tfm.h index cc269c44f..f1f525e9c 100644 --- a/wolfssl/wolfcrypt/tfm.h +++ b/wolfssl/wolfcrypt/tfm.h @@ -467,10 +467,10 @@ void fp_add(fp_int *a, fp_int *b, fp_int *c); void fp_sub(fp_int *a, fp_int *b, fp_int *c); /* c = a * b */ -void fp_mul(fp_int *a, fp_int *b, fp_int *c); +int fp_mul(fp_int *a, fp_int *b, fp_int *c); /* b = a*a */ -void fp_sqr(fp_int *a, fp_int *b); +int fp_sqr(fp_int *a, fp_int *b); /* a/b => cb + d == a */ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d); @@ -485,7 +485,7 @@ int fp_cmp_d(fp_int *a, fp_digit b); void fp_add_d(fp_int *a, fp_digit b, fp_int *c); /* c = a - b */ -void fp_sub_d(fp_int *a, fp_digit b, fp_int *c); +int fp_sub_d(fp_int *a, fp_digit b, fp_int *c); /* c = a * b */ void fp_mul_d(fp_int *a, 
fp_digit b, fp_int *c); @@ -519,10 +519,10 @@ int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c); int fp_invmod(fp_int *a, fp_int *b, fp_int *c); /* c = (a, b) */ -/*void fp_gcd(fp_int *a, fp_int *b, fp_int *c);*/ +/*int fp_gcd(fp_int *a, fp_int *b, fp_int *c);*/ /* c = [a, b] */ -/*void fp_lcm(fp_int *a, fp_int *b, fp_int *c);*/ +/*int fp_lcm(fp_int *a, fp_int *b, fp_int *c);*/ /* setups the montgomery reduction */ int fp_montgomery_setup(fp_int *a, fp_digit *mp); @@ -533,7 +533,7 @@ int fp_montgomery_setup(fp_int *a, fp_digit *mp); void fp_montgomery_calc_normalization(fp_int *a, fp_int *b); /* computes x/R == x (mod N) via Montgomery Reduction */ -void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp); +int fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp); /* d = a**b (mod c) */ int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d); @@ -547,7 +547,7 @@ int fp_exptmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d); /* 256 trial divisions + 8 Miller-Rabins, returns FP_YES if probable prime */ /*int fp_isprime(fp_int *a);*/ /* extended version of fp_isprime, do 't' Miller-Rabins instead of only 8 */ -/*int fp_isprime_ex(fp_int *a, int t);*/ +/*int fp_isprime_ex(fp_int *a, int t, int* result);*/ /* Primality generation flags */ /*#define TFM_PRIME_BBS 0x0001 */ /* BBS style prime */ @@ -568,7 +568,7 @@ int fp_leading_bit(fp_int *a); int fp_unsigned_bin_size(fp_int *a); void fp_read_unsigned_bin(fp_int *a, const unsigned char *b, int c); -void fp_to_unsigned_bin(fp_int *a, unsigned char *b); +int fp_to_unsigned_bin(fp_int *a, unsigned char *b); int fp_to_unsigned_bin_at_pos(int x, fp_int *t, unsigned char *b); /*int fp_signed_bin_size(fp_int *a);*/ @@ -585,39 +585,39 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c); void s_fp_sub(fp_int *a, fp_int *b, fp_int *c); void fp_reverse(unsigned char *s, int len); -void fp_mul_comba(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba_small(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba3(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba4(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba6(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba7(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba8(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba9(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba12(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba17(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba20(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba24(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba28(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba32(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba48(fp_int *a, fp_int *b, fp_int *c); -void fp_mul_comba64(fp_int *a, fp_int *b, fp_int *c); -void fp_sqr_comba(fp_int *a, fp_int *b); -void fp_sqr_comba_small(fp_int *a, fp_int *b); -void fp_sqr_comba3(fp_int *a, fp_int *b); -void fp_sqr_comba4(fp_int *a, fp_int *b); -void fp_sqr_comba6(fp_int *a, fp_int *b); -void fp_sqr_comba7(fp_int *a, fp_int *b); -void fp_sqr_comba8(fp_int *a, fp_int *b); -void fp_sqr_comba9(fp_int *a, fp_int *b); -void fp_sqr_comba12(fp_int *a, fp_int *b); -void fp_sqr_comba17(fp_int *a, fp_int *b); -void fp_sqr_comba20(fp_int *a, fp_int *b); -void fp_sqr_comba24(fp_int *a, fp_int *b); -void fp_sqr_comba28(fp_int *a, fp_int *b); -void fp_sqr_comba32(fp_int *a, fp_int *b); -void fp_sqr_comba48(fp_int *a, fp_int *b); -void fp_sqr_comba64(fp_int *a, fp_int *b); +int fp_mul_comba_small(fp_int *a, fp_int *b, fp_int *c); +int 
fp_mul_comba3(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba4(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba6(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba7(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba8(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba9(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba12(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba17(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba20(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba24(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba28(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba32(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba48(fp_int *a, fp_int *b, fp_int *c); +int fp_mul_comba64(fp_int *a, fp_int *b, fp_int *c); +int fp_sqr_comba(fp_int *a, fp_int *b); +int fp_sqr_comba_small(fp_int *a, fp_int *b); +int fp_sqr_comba3(fp_int *a, fp_int *b); +int fp_sqr_comba4(fp_int *a, fp_int *b); +int fp_sqr_comba6(fp_int *a, fp_int *b); +int fp_sqr_comba7(fp_int *a, fp_int *b); +int fp_sqr_comba8(fp_int *a, fp_int *b); +int fp_sqr_comba9(fp_int *a, fp_int *b); +int fp_sqr_comba12(fp_int *a, fp_int *b); +int fp_sqr_comba17(fp_int *a, fp_int *b); +int fp_sqr_comba20(fp_int *a, fp_int *b); +int fp_sqr_comba24(fp_int *a, fp_int *b); +int fp_sqr_comba28(fp_int *a, fp_int *b); +int fp_sqr_comba32(fp_int *a, fp_int *b); +int fp_sqr_comba48(fp_int *a, fp_int *b); +int fp_sqr_comba64(fp_int *a, fp_int *b); /**
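
Since _fp_exptmod can now fail part-way through the window loop (FP_MEM from the table allocation, or an error bubbled up from fp_mul/fp_montgomery_reduce), callers of the public fp_exptmod should no longer treat it as infallible. A minimal caller-side sketch under that assumption; modexp_checked and its arguments are illustrative names for already-initialized fp_ints:

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/tfm.h>

    /* y = g^x mod p, propagating the new error codes instead of dropping them */
    static int modexp_checked(fp_int* g, fp_int* x, fp_int* p, fp_int* y)
    {
        int err = fp_exptmod(g, x, p, y);
        if (err != FP_OKAY) {
            /* FP_MEM: a temporary (window table, reduction buffer) could not be
             * allocated; FP_VAL: e.g. negative exponent with POSITIVE_EXP_ONLY */
            return err;
        }
        return FP_OKAY;
    }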
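
The negative-exponent branch of fp_exptmod computes g^(-x) mod p as (g^(-1))^x mod p: copy G, invert it modulo P, then exponentiate with the sign flipped. The same idea as a standalone sketch using only the public fp_ calls; a hedged illustration rather than the library's own code path, with x assumed positive and a stack fp_int used for brevity:

    /* y = g^(-x) mod p for x > 0, by inverting g first */
    static int modexp_neg(fp_int* g, fp_int* x, fp_int* p, fp_int* y)
    {
        int err;
        fp_int inv;

        fp_init_copy(&inv, g);
        err = fp_invmod(&inv, p, &inv);       /* inv = g^-1 mod p */
        if (err == FP_OKAY)
            err = fp_exptmod(&inv, x, p, y);  /* y = inv^x mod p  */
        return err;
    }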
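
The A == B guard in fp_sqr_comba exists because the comba loops read A->dp while writing the destination; with an aliased output, later partial products would see digits that have already been overwritten. A toy, non-wolfSSL illustration of that hazard, with carries ignored for clarity:

    /* "squaring" two digits in place: the second store still needs the
     * original low digit, but the first store has already clobbered it */
    static void square_in_place_wrong(unsigned int d[2])
    {
        unsigned int lo = d[0];       /* value the correct result would use   */
        d[0] = d[0] * d[0];           /* low partial product overwrites d[0]  */
        d[1] = 2u * d[0] * d[1];      /* WRONG: wanted 2u * lo * d[1]         */
        (void)lo;
    }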
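
fp_montgomery_reduce can now report FP_MEM, so code that drives the Montgomery machinery directly has to thread the status through every step. A condensed sketch of one modular multiplication using the same setup / normalize / reduce sequence as _fp_exptmod; mont_mulmod is an illustrative helper, m is assumed odd and already reduced, and a stack fp_int is used for brevity:

    /* r = a*b mod m via Montgomery reduction */
    static int mont_mulmod(fp_int* a, fp_int* b, fp_int* m, fp_int* r)
    {
        int      err;
        fp_digit mp;
        fp_int   norm;

        err = fp_montgomery_setup(m, &mp);           /* mp = -1/m[0] mod 2^DIGIT_BIT */
        if (err != FP_OKAY)
            return err;

        fp_init(&norm);
        fp_montgomery_calc_normalization(&norm, m);  /* norm = R mod m */

        err = fp_mulmod(a, &norm, m, r);             /* r = a*R mod m (Montgomery form) */
        if (err != FP_OKAY)
            return err;

        err = fp_mul(r, b, r);                       /* r = a*R*b            */
        if (err == FP_OKAY)
            err = fp_montgomery_reduce(r, m, mp);    /* r = a*b mod m        */
        return err;
    }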
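
fp_div_d is schoolbook short division: walk the digits from most significant to least, keeping the running remainder in an fp_word that is twice as wide as a digit. The same loop on a plain array of 32-bit digits, as a self-contained reference sketch:

    #include <stdint.h>

    /* divide a little-endian digit array by a single digit b in place,
     * returning the remainder; b must be non-zero */
    static uint32_t short_div(uint32_t* dp, int used, uint32_t b)
    {
        uint64_t w = 0;
        int      i;

        for (i = used - 1; i >= 0; i--) {
            w = (w << 32) | dp[i];          /* bring down the next digit  */
            dp[i] = (uint32_t)(w / b);      /* quotient digit             */
            w = w % b;                      /* remainder carries downward */
        }
        return (uint32_t)w;
    }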
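
fp_prime_miller_rabin and fp_isprime_ex now separate the status code from the primality verdict, so a caller checks both: the return value for FP_MEM, and the FP_YES/FP_NO result for the answer. A usage sketch through the mp_ wrapper; candidate is an assumed, already-initialized mp_int:

    int is_prime = FP_NO;
    int ret = mp_prime_is_prime(candidate, 8, &is_prime);  /* 8 Miller-Rabin rounds */

    if (ret != FP_OKAY) {
        /* allocation or internal failure: the verdict in is_prime is meaningless */
    }
    else if (is_prime == FP_YES) {
        /* candidate is probably prime (fixed small-prime bases) */
    }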
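
mp_prime_is_prime_ex draws its Miller-Rabin bases from the caller's RNG rather than the fixed small primes, so it needs an initialized WC_RNG. A usage sketch, assuming the usual wolfSSL RNG header; n is an assumed, already-initialized mp_int and error handling is abbreviated:

    #include <wolfssl/wolfcrypt/random.h>

    WC_RNG rng;
    int    is_prime = FP_NO;

    if (wc_InitRng(&rng) == 0) {
        /* 8 random-base Miller-Rabin rounds on top of the trial divisions */
        if (mp_prime_is_prime_ex(n, 8, &is_prime, &rng) == FP_OKAY &&
            is_prime == FP_YES) {
            /* n is probably prime */
        }
        wc_FreeRng(&rng);
    }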
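
fp_lcm divides the larger operand by the gcd before multiplying, keeping the intermediate product as small as possible; fp_gcd is the plain Euclidean remainder loop. The identities involved, on 64-bit integers, as a reference sketch:

    /* Euclid's algorithm: same u, v, r rotation as fp_gcd */
    static unsigned long long gcd_ull(unsigned long long u, unsigned long long v)
    {
        while (v != 0) {
            unsigned long long r = u % v;
            u = v;
            v = r;
        }
        return u;
    }

    /* lcm(a, b) = (a / gcd(a, b)) * b, dividing first to limit growth */
    static unsigned long long lcm_ull(unsigned long long a, unsigned long long b)
    {
        if (a == 0 || b == 0)
            return 0;
        return (a / gcd_ull(a, b)) * b;
    }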
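
mp_radix_size reports the buffer length (sign and NUL terminator included) that mp_toradix will need, so the two calls are normally paired. A usage sketch with a heap buffer; a is an assumed, already-initialized, non-zero mp_int:

    int   sz  = 0;
    char* str = NULL;

    if (mp_radix_size(a, 10, &sz) == FP_OKAY) {
        str = (char*)XMALLOC(sz, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        if (str != NULL) {
            if (mp_toradix(a, str, 10) == FP_OKAY) {
                /* str now holds the decimal representation, NUL terminated */
            }
            XFREE(str, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        }
    }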
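
Because these prototypes changed from void to int, old call sites still compile but now silently discard failures; under WOLFSSL_SMALL_STACK that can hide an FP_MEM. A minimal before/after sketch of the calling pattern, with illustrative names:

    /* before: any allocation failure inside fp_mul/fp_to_unsigned_bin is lost */
    fp_mul(&a, &b, &c);
    fp_to_unsigned_bin(&c, out);

    /* after: propagate the status */
    int err = fp_mul(&a, &b, &c);
    if (err == FP_OKAY)
        err = fp_to_unsigned_bin(&c, out);
    if (err != FP_OKAY)
        return err;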