diff --git a/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h b/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
index 7947ef34e..f8d751ff0 100644
--- a/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
+++ b/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
@@ -79,12 +79,13 @@ extern "C" {
     #define ECC_TIMING_RESISTANT
 
     #ifdef USE_FAST_MATH
-        /* Max ECC bits (curve size * 8). ECC256 is (32*8) = 256 */
-        /* Note: ECC521 requires (curve size * 16): (66*16) = 1056 */
+        /* use reduced size math buffers for ecc points */
         #undef  ALT_ECC_SIZE
         #define ALT_ECC_SIZE
+
+        /* optionally override the default max ecc bits */
         #undef  FP_MAX_BITS_ECC
-        #define FP_MAX_BITS_ECC     1056
+        /* #define FP_MAX_BITS_ECC     512 */
 
         /* Enable TFM optimizations for ECC */
         #define TFM_ECC192
diff --git a/configure.ac b/configure.ac
index a46fcc17b..3f496a7d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -731,6 +731,14 @@ then
 ECC_DEFAULT=yes
 fi
 
+# ECC Shamir (only takes effect when ECC is enabled; on by default)
+AC_ARG_ENABLE([eccshamir],
+    [AS_HELP_STRING([--enable-eccshamir],[Enable ECC Shamir (default: enabled)])],
+    [ ENABLED_ECC_SHAMIR=$enableval ],
+    [ ENABLED_ECC_SHAMIR=yes ]
+    )
+
+
 # ECC
 AC_ARG_ENABLE([ecc],
     [AS_HELP_STRING([--enable-ecc],[Enable ECC (default: enabled on x86_64)])],
@@ -751,7 +759,11 @@ fi
 
 if test "$ENABLED_ECC" = "yes"
 then
-    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
+    if test "$ENABLED_ECC_SHAMIR" = "yes"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+    fi
 fi
 
 AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
@@ -1961,8 +1973,13 @@ then
     if test "x$ENABLED_ECC" = "xno"
     then
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
     if test "x$ENABLED_OPENSSLEXTRA" = "xno"
     then
@@ -2029,8 +2046,13 @@ then
     if test "x$ENABLED_ECC" = "xno"
     then
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
     if test "x$ENABLED_PKCALLBACKS" = "xno"
     then
@@ -2122,8 +2144,13 @@ then
     then
         ENABLED_OPENSSLEXTRA="yes"
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
 
     AM_CFLAGS="$AM_CFLAGS -DHAVE_STUNNEL -DWOLFSSL_ALWAYS_VERIFY_CB"
diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index f3a47793b..9167eac36 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -268,285 +268,6 @@ static mp_digit get_digit(mp_int* a, int n)
 }
 
 
-#if defined(USE_FAST_MATH)
-
-/* fast math accelerated version, but not for fp ecc yet */
-
-/**
-   Add two ECC points
-   P        The point to add
-   Q        The point to add
-   R        [out] The destination of the double
-   modulus  The modulus of the field the ECC curve is in
-   mp       The "b" value from montgomery_setup()
-   return   MP_OKAY on success
-*/
-int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
-                             mp_int* modulus, mp_digit* mp)
-{
-   fp_int t1, t2, x, y, z;
-   int    err;
-
-   if (P == NULL || Q == NULL || R == NULL || modulus == NULL || mp == NULL)
-       return ECC_BAD_ARG_E;
-
-   if ((err = mp_init_multi(&t1, &t2, &x, &y, &z, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* should we dbl instead? */
-   fp_sub(modulus, Q->y, &t1);
-   if ( (fp_cmp(P->x, Q->x) == FP_EQ) &&
-        (get_digit_count(Q->z) && fp_cmp(P->z, Q->z) == FP_EQ) &&
-        (fp_cmp(P->y, Q->y) == FP_EQ || fp_cmp(P->y, &t1) == FP_EQ)) {
-        return ecc_projective_dbl_point(P, R, modulus, mp);
-   }
-
-   fp_copy(P->x, &x);
-   fp_copy(P->y, &y);
-   fp_copy(P->z, &z);
-
-   /* if Z is one then these are no-operations */
-   if (get_digit_count(Q->z)) {
-      /* T1 = Z' * Z' */
-      fp_sqr(Q->z, &t1);
-      fp_montgomery_reduce(&t1, modulus, *mp);
-      /* X = X * T1 */
-      fp_mul(&t1, &x, &x);
-      fp_montgomery_reduce(&x, modulus, *mp);
-      /* T1 = Z' * T1 */
-      fp_mul(Q->z, &t1, &t1);
-      fp_montgomery_reduce(&t1, modulus, *mp);
-      /* Y = Y * T1 */
-      fp_mul(&t1, &y, &y);
-      fp_montgomery_reduce(&y, modulus, *mp);
-   }
-
-   /* T1 = Z*Z */
-   fp_sqr(&z, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* T2 = X' * T1 */
-   fp_mul(Q->x, &t1, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = Z * T1 */
-   fp_mul(&z, &t1, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* T1 = Y' * T1 */
-   fp_mul(Q->y, &t1, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-
-   /* Y = Y - T1 */
-   fp_sub(&y, &t1, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-   /* T1 = 2T1 */
-   fp_add(&t1, &t1, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T1 = Y + T1 */
-   fp_add(&t1, &y, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* X = X - T2 */
-   fp_sub(&x, &t2, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-   /* T2 = 2T2 */
-   fp_add(&t2, &t2, &t2);
-   if (fp_cmp(&t2, modulus) != FP_LT) {
-      fp_sub(&t2, modulus, &t2);
-   }
-   /* T2 = X + T2 */
-   fp_add(&t2, &x, &t2);
-   if (fp_cmp(&t2, modulus) != FP_LT) {
-      fp_sub(&t2, modulus, &t2);
-   }
-
-   /* if Z' != 1 */
-   if (get_digit_count(Q->z)) {
-      /* Z = Z * Z' */
-      fp_mul(&z, Q->z, &z);
-      fp_montgomery_reduce(&z, modulus, *mp);
-   }
-
-   /* Z = Z * X */
-   fp_mul(&z, &x, &z);
-   fp_montgomery_reduce(&z, modulus, *mp);
-
-   /* T1 = T1 * X  */
-   fp_mul(&t1, &x, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* X = X * X */
-   fp_sqr(&x, &x);
-   fp_montgomery_reduce(&x, modulus, *mp);
-   /* T2 = T2 * x */
-   fp_mul(&t2, &x, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = T1 * X  */
-   fp_mul(&t1, &x, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-
-   /* X = Y*Y */
-   fp_sqr(&y, &x);
-   fp_montgomery_reduce(&x, modulus, *mp);
-   /* X = X - T2 */
-   fp_sub(&x, &t2, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-
-   /* T2 = T2 - X */
-   fp_sub(&t2, &x, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T2 = T2 - X */
-   fp_sub(&t2, &x, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T2 = T2 * Y */
-   fp_mul(&t2, &y, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* Y = T2 - T1 */
-   fp_sub(&t2, &t1, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-   /* Y = Y/2 */
-   if (fp_isodd(&y)) {
-      fp_add(&y, modulus, &y);
-   }
-   fp_div_2(&y, &y);
-
-   fp_copy(&x, R->x);
-   fp_copy(&y, R->y);
-   fp_copy(&z, R->z);
-
-   return MP_OKAY;
-}
-
-
-/**
-   Double an ECC point
-   P   The point to double
-   R   [out] The destination of the double
-   modulus  The modulus of the field the ECC curve is in
-   mp       The "b" value from montgomery_setup()
-   return   MP_OKAY on success
-*/
-int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
-                             mp_digit* mp)
-{
-   fp_int   t1, t2;
-   int      err;
-
-   if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
-       return ECC_BAD_ARG_E;
-
-   if (P != R) {
-      fp_copy(P->x, R->x);
-      fp_copy(P->y, R->y);
-      fp_copy(P->z, R->z);
-   }
-
-   if ((err = mp_init_multi(&t1, &t2, NULL, NULL, NULL, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* t1 = Z * Z */
-   fp_sqr(R->z, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* Z = Y * Z */
-   fp_mul(R->z, R->y, R->z);
-   fp_montgomery_reduce(R->z, modulus, *mp);
-   /* Z = 2Z */
-   fp_add(R->z, R->z, R->z);
-   if (fp_cmp(R->z, modulus) != FP_LT) {
-      fp_sub(R->z, modulus, R->z);
-   }
-
-   /* &t2 = X - T1 */
-   fp_sub(R->x, &t1, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T1 = X + T1 */
-   fp_add(&t1, R->x, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T2 = T1 * T2 */
-   fp_mul(&t1, &t2, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = 2T2 */
-   fp_add(&t2, &t2, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T1 = T1 + T2 */
-   fp_add(&t1, &t2, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-
-   /* Y = 2Y */
-   fp_add(R->y, R->y, R->y);
-   if (fp_cmp(R->y, modulus) != FP_LT) {
-      fp_sub(R->y, modulus, R->y);
-   }
-   /* Y = Y * Y */
-   fp_sqr(R->y, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
-   /* T2 = Y * Y */
-   fp_sqr(R->y, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T2 = T2/2 */
-   if (fp_isodd(&t2)) {
-      fp_add(&t2, modulus, &t2);
-   }
-   fp_div_2(&t2, &t2);
-   /* Y = Y * X */
-   fp_mul(R->y, R->x, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
-
-   /* X  = T1 * T1 */
-   fp_sqr(&t1, R->x);
-   fp_montgomery_reduce(R->x, modulus, *mp);
-   /* X = X - Y */
-   fp_sub(R->x, R->y, R->x);
-   if (fp_cmp_d(R->x, 0) == FP_LT) {
-      fp_add(R->x, modulus, R->x);
-   }
-   /* X = X - Y */
-   fp_sub(R->x, R->y, R->x);
-   if (fp_cmp_d(R->x, 0) == FP_LT) {
-      fp_add(R->x, modulus, R->x);
-   }
-
-   /* Y = Y - X */
-   fp_sub(R->y, R->x, R->y);
-   if (fp_cmp_d(R->y, 0) == FP_LT) {
-      fp_add(R->y, modulus, R->y);
-   }
-   /* Y = Y * T1 */
-   fp_mul(R->y, &t1, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
-   /* Y = Y - T2 */
-   fp_sub(R->y, &t2, R->y);
-   if (fp_cmp_d(R->y, 0) == FP_LT) {
-      fp_add(R->y, modulus, R->y);
-   }
-
-   return MP_OKAY;
-}
-
-#else /* USE_FAST_MATH */
-
 /**
    Add two ECC points
    P        The point to add
@@ -559,43 +280,63 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
 int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
                              mp_int* modulus, mp_digit* mp)
 {
-   mp_int t1;
-   mp_int t2;
-   mp_int x;
-   mp_int y;
-   mp_int z;
+   mp_int t1, t2;
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
+   mp_int rx, ry, rz;
+#endif
+   mp_int *x, *y, *z;
    int    err;
 
    if (P == NULL || Q == NULL || R == NULL || modulus == NULL || mp == NULL)
        return ECC_BAD_ARG_E;
 
-   if ((err = mp_init_multi(&t1, &t2, &x, &y, &z, NULL)) != MP_OKAY) {
+   if ((err = mp_init_multi(&t1, &t2, NULL, NULL, NULL, NULL)) != MP_OKAY) {
       return err;
    }
 
    /* should we dbl instead? */
-   err = mp_sub(modulus, Q->y, &t1);
-
+   if (err == MP_OKAY)
+       err = mp_sub(modulus, Q->y, &t1);
    if (err == MP_OKAY) {
        if ( (mp_cmp(P->x, Q->x) == MP_EQ) &&
             (get_digit_count(Q->z) && mp_cmp(P->z, Q->z) == MP_EQ) &&
             (mp_cmp(P->y, Q->y) == MP_EQ || mp_cmp(P->y, &t1) == MP_EQ)) {
                 mp_clear(&t1);
                 mp_clear(&t2);
-                mp_clear(&x);
-                mp_clear(&y);
-                mp_clear(&z);
-
                 return ecc_projective_dbl_point(P, R, modulus, mp);
        }
    }
+   
+   if (err != MP_OKAY) {
+      mp_clear(&t1);
+      mp_clear(&t2);
+      return err;
+   }
+
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
+   /* Use local stack variable */
+   x = &rx;
+   y = &ry;
+   z = &rz;
+
+   if ((err = mp_init_multi(x, y, z, NULL, NULL, NULL)) != MP_OKAY) {
+      mp_clear(&t1);
+      mp_clear(&t2);
+      return err;
+   }
+#else
+   /* Use destination directly */
+   x = R->x;
+   y = R->y;
+   z = R->z;
+#endif
 
    if (err == MP_OKAY)
-       err = mp_copy(P->x, &x);
+       err = mp_copy(P->x, x);
    if (err == MP_OKAY)
-       err = mp_copy(P->y, &y);
+       err = mp_copy(P->y, y);
    if (err == MP_OKAY)
-       err = mp_copy(P->z, &z);
+       err = mp_copy(P->z, z);
 
    /* if Z is one then these are no-operations */
    if (err == MP_OKAY) {
@@ -607,9 +348,9 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
            /* X = X * T1 */
            if (err == MP_OKAY)
-               err = mp_mul(&t1, &x, &x);
+               err = mp_mul(&t1, x, x);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&x, modulus, *mp);
+               err = mp_montgomery_reduce(x, modulus, *mp);
 
            /* T1 = Z' * T1 */
            if (err == MP_OKAY)
@@ -619,15 +360,15 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
            /* Y = Y * T1 */
            if (err == MP_OKAY)
-               err = mp_mul(&t1, &y, &y);
+               err = mp_mul(&t1, y, y);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&y, modulus, *mp);
+               err = mp_montgomery_reduce(y, modulus, *mp);
        }
    }
 
    /* T1 = Z*Z */
    if (err == MP_OKAY)
-       err = mp_sqr(&z, &t1);
+       err = mp_sqr(z, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
@@ -639,7 +380,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
    /* T1 = Z * T1 */
    if (err == MP_OKAY)
-       err = mp_mul(&z, &t1, &t1);
+       err = mp_mul(z, &t1, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
@@ -651,10 +392,10 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
    /* Y = Y - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(&y, &t1, &y);
+       err = mp_sub(y, &t1, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&y, 0) == MP_LT)
-           err = mp_add(&y, modulus, &y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* T1 = 2T1 */
    if (err == MP_OKAY)
@@ -665,17 +406,17 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    }
    /* T1 = Y + T1 */
    if (err == MP_OKAY)
-       err = mp_add(&t1, &y, &t1);
+       err = mp_add(&t1, y, &t1);
    if (err == MP_OKAY) {
        if (mp_cmp(&t1, modulus) != MP_LT)
            err = mp_sub(&t1, modulus, &t1);
    }
    /* X = X - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(&x, &t2, &x);
+       err = mp_sub(x, &t2, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&x, 0) == MP_LT)
-           err = mp_add(&x, modulus, &x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* T2 = 2T2 */
    if (err == MP_OKAY)
@@ -686,7 +427,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    }
    /* T2 = X + T2 */
    if (err == MP_OKAY)
-       err = mp_add(&t2, &x, &t2);
+       err = mp_add(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp(&t2, modulus) != MP_LT)
            err = mp_sub(&t2, modulus, &t2);
@@ -695,103 +436,104 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    if (err == MP_OKAY) {
        if (get_digit_count(Q->z)) {
            /* Z = Z * Z' */
-           err = mp_mul(&z, Q->z, &z);
+           err = mp_mul(z, Q->z, z);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&z, modulus, *mp);
+               err = mp_montgomery_reduce(z, modulus, *mp);
        }
    }
 
    /* Z = Z * X */
    if (err == MP_OKAY)
-       err = mp_mul(&z, &x, &z);
+       err = mp_mul(z, x, z);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&z, modulus, *mp);
+       err = mp_montgomery_reduce(z, modulus, *mp);
 
    /* T1 = T1 * X  */
    if (err == MP_OKAY)
-       err = mp_mul(&t1, &x, &t1);
+       err = mp_mul(&t1, x, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* X = X * X */
    if (err == MP_OKAY)
-       err = mp_sqr(&x, &x);
+       err = mp_sqr(x, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* T2 = T2 * x */
    if (err == MP_OKAY)
-       err = mp_mul(&t2, &x, &t2);
+       err = mp_mul(&t2, x, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
    /* T1 = T1 * X  */
    if (err == MP_OKAY)
-       err = mp_mul(&t1, &x, &t1);
+       err = mp_mul(&t1, x, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* X = Y*Y */
    if (err == MP_OKAY)
-       err = mp_sqr(&y, &x);
+       err = mp_sqr(y, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* X = X - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(&x, &t2, &x);
+       err = mp_sub(x, &t2, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&x, 0) == MP_LT)
-           err = mp_add(&x, modulus, &x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* T2 = T2 - X */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &x, &t2);
+       err = mp_sub(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T2 = T2 - X */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &x, &t2);
+       err = mp_sub(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T2 = T2 * Y */
    if (err == MP_OKAY)
-       err = mp_mul(&t2, &y, &t2);
+       err = mp_mul(&t2, y, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
    /* Y = T2 - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &t1, &y);
+       err = mp_sub(&t2, &t1, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&y, 0) == MP_LT)
-           err = mp_add(&y, modulus, &y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* Y = Y/2 */
    if (err == MP_OKAY) {
-       if (mp_isodd(&y))
-           err = mp_add(&y, modulus, &y);
+       if (mp_isodd(y))
+           err = mp_add(y, modulus, y);
    }
    if (err == MP_OKAY)
-       err = mp_div_2(&y, &y);
+       err = mp_div_2(y, y);
 
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
    if (err == MP_OKAY)
-       err = mp_copy(&x, R->x);
+       err = mp_copy(x, R->x);
    if (err == MP_OKAY)
-       err = mp_copy(&y, R->y);
+       err = mp_copy(y, R->y);
    if (err == MP_OKAY)
-       err = mp_copy(&z, R->z);
+       err = mp_copy(z, R->z);
+#endif
 
+#ifndef USE_FAST_MATH
    /* clean up */
    mp_clear(&t1);
    mp_clear(&t2);
-   mp_clear(&x);
-   mp_clear(&y);
-   mp_clear(&z);
+#endif
 
    return err;
 }
@@ -808,8 +550,11 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
                              mp_digit* mp)
 {
-   mp_int t1;
-   mp_int t2;
+   mp_int t1, t2;
+#ifdef ALT_ECC_SIZE
+   mp_int rx, ry, rz;
+#endif
+   mp_int *x, *y, *z;
    int    err;
 
    if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
@@ -819,44 +564,61 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
       return err;
    }
 
-   if (P != R) {
-      err = mp_copy(P->x, R->x);
-      if (err == MP_OKAY)
-          err = mp_copy(P->y, R->y);
-      if (err == MP_OKAY)
-          err = mp_copy(P->z, R->z);
+#ifdef ALT_ECC_SIZE
+   /* Use local stack variable */
+   x = &rx;
+   y = &ry;
+   z = &rz;
+
+   if ((err = mp_init_multi(x, y, z, NULL, NULL, NULL)) != MP_OKAY) {
+       mp_clear(&t1);
+       mp_clear(&t2);
+       return err;
    }
+#else
+   /* Use destination directly */
+   x = R->x;
+   y = R->y;
+   z = R->z;
+#endif
+
+   if (err == MP_OKAY)
+       err = mp_copy(P->x, x);
+   if (err == MP_OKAY)
+       err = mp_copy(P->y, y);
+   if (err == MP_OKAY)
+       err = mp_copy(P->z, z);
 
    /* t1 = Z * Z */
    if (err == MP_OKAY)
-       err = mp_sqr(R->z, &t1);
+       err = mp_sqr(z, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* Z = Y * Z */
    if (err == MP_OKAY)
-       err = mp_mul(R->z, R->y, R->z);
+       err = mp_mul(z, y, z);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->z, modulus, *mp);
+       err = mp_montgomery_reduce(z, modulus, *mp);
 
    /* Z = 2Z */
    if (err == MP_OKAY)
-       err = mp_add(R->z, R->z, R->z);
+       err = mp_add(z, z, z);
    if (err == MP_OKAY) {
-       if (mp_cmp(R->z, modulus) != MP_LT)
-           err = mp_sub(R->z, modulus, R->z);
+       if (mp_cmp(z, modulus) != MP_LT)
+           err = mp_sub(z, modulus, z);
    }
 
    /* T2 = X - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, &t1, &t2);
+       err = mp_sub(x, &t1, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T1 = X + T1 */
    if (err == MP_OKAY)
-       err = mp_add(&t1, R->x, &t1);
+       err = mp_add(&t1, x, &t1);
    if (err == MP_OKAY) {
        if (mp_cmp(&t1, modulus) != MP_LT)
            err = mp_sub(&t1, modulus, &t1);
@@ -883,20 +645,20 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
    }
    /* Y = 2Y */
    if (err == MP_OKAY)
-       err = mp_add(R->y, R->y, R->y);
+       err = mp_add(y, y, y);
    if (err == MP_OKAY) {
-       if (mp_cmp(R->y, modulus) != MP_LT)
-           err = mp_sub(R->y, modulus, R->y);
+       if (mp_cmp(y, modulus) != MP_LT)
+           err = mp_sub(y, modulus, y);
    }
    /* Y = Y * Y */
    if (err == MP_OKAY)
-       err = mp_sqr(R->y, R->y);
+       err = mp_sqr(y, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* T2 = Y * Y */
    if (err == MP_OKAY)
-       err = mp_sqr(R->y, &t2);
+       err = mp_sqr(y, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
@@ -910,59 +672,69 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
 
    /* Y = Y * X */
    if (err == MP_OKAY)
-       err = mp_mul(R->y, R->x, R->y);
+       err = mp_mul(y, x, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* X  = T1 * T1 */
    if (err == MP_OKAY)
-       err = mp_sqr(&t1, R->x);
+       err = mp_sqr(&t1, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* X = X - Y */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, R->y, R->x);
+       err = mp_sub(x, y, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->x, 0) == MP_LT)
-           err = mp_add(R->x, modulus, R->x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* X = X - Y */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, R->y, R->x);
+       err = mp_sub(x, y, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->x, 0) == MP_LT)
-           err = mp_add(R->x, modulus, R->x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* Y = Y - X */
    if (err == MP_OKAY)
-       err = mp_sub(R->y, R->x, R->y);
+       err = mp_sub(y, x, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->y, 0) == MP_LT)
-           err = mp_add(R->y, modulus, R->y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* Y = Y * T1 */
    if (err == MP_OKAY)
-       err = mp_mul(R->y, &t1, R->y);
+       err = mp_mul(y, &t1, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* Y = Y - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(R->y, &t2, R->y);
+       err = mp_sub(y, &t2, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->y, 0) == MP_LT)
-           err = mp_add(R->y, modulus, R->y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
 
+#ifdef ALT_ECC_SIZE
+   if (err == MP_OKAY)
+       err = mp_copy(x, R->x);
+   if (err == MP_OKAY)
+       err = mp_copy(y, R->y);
+   if (err == MP_OKAY)
+       err = mp_copy(z, R->z);
+#endif
+
+#ifndef USE_FAST_MATH
    /* clean up */
    mp_clear(&t1);
    mp_clear(&t2);
+#endif
 
    return err;
 }
 
-#endif /* USE_FAST_MATH */
 
 /**
   Map a projective jacbobian point back to affine space
@@ -2762,7 +2534,7 @@ int wc_ecc_export_x963_ex(ecc_key* key, byte* out, word32* outLen,
 }
 #endif /* HAVE_ECC_KEY_EXPORT */
 
-/* is ec point on curve described by dp ? */
+/* is ecc point on curve described by dp ? */
 static int ecc_is_point(const ecc_set_type* dp, ecc_point* ecp, mp_int* prime)
 {
    mp_int b, t1, t2;
diff --git a/wolfcrypt/src/integer.c b/wolfcrypt/src/integer.c
index 9e9b3d01e..045effb9f 100644
--- a/wolfcrypt/src/integer.c
+++ b/wolfcrypt/src/integer.c
@@ -40,6 +40,10 @@
 
 #include <wolfssl/wolfcrypt/integer.h>
 
+#ifdef WOLFSSL_DEBUG_MATH
+    #include <stdio.h>
+#endif
+
 #ifndef NO_WOLFSSL_SMALL_STACK
     #ifndef WOLFSSL_SMALL_STACK
         #define WOLFSSL_SMALL_STACK
@@ -4628,7 +4632,8 @@ int mp_read_radix (mp_int * a, const char *str, int radix)
 }
 #endif /* HAVE_ECC */
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 /* returns size of ASCII representation */
 int mp_radix_size (mp_int *a, int radix, int *size)
@@ -4739,7 +4744,36 @@ int mp_toradix (mp_int *a, char *str, int radix)
     return MP_OKAY;
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#ifdef WOLFSSL_DEBUG_MATH
+/* Debug helper: print desc, the header fields, the value in hex and, when
+ * verbose is non-zero, the raw bytes of the digit array. NULL a is ignored. */
+void mp_dump(const char* desc, mp_int* a, byte verbose)
+{
+  char *buffer;
+  int size; /* worst case hex length: 2 chars per digit byte, sign, NUL, pad */
+
+  if (a == NULL) return;
+  size = a->alloc * (int)sizeof(mp_digit) * 2 + 3;
+  buffer = (char*)XMALLOC(size, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (buffer == NULL) return;
+
+  printf("%s: ptr=%p, used=%d, sign=%d, size=%d, mpd=%d\n",
+    desc, (void*)a, a->used, a->sign, a->alloc, (int)sizeof(mp_digit));
+
+  if (mp_toradix(a, buffer, 16) == MP_OKAY)
+    printf("  %s\n  ", buffer);
+
+  if (verbose) {
+    int i;
+    for (i = 0; i < a->alloc * (int)sizeof(mp_digit); i++)
+      printf("%02x ", *(((byte*)a->dp) + i));
+    printf("\n");
+  }
+  XFREE(buffer, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+}
+#endif /* WOLFSSL_DEBUG_MATH */
+
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #endif /* USE_FAST_MATH */
 
diff --git a/wolfcrypt/src/rsa.c b/wolfcrypt/src/rsa.c
index 690a7c804..7dd775809 100644
--- a/wolfcrypt/src/rsa.c
+++ b/wolfcrypt/src/rsa.c
@@ -845,7 +845,9 @@ static int wc_RsaFunction(const byte* in, word32 inLen, byte* out,
             mp_clear(&tmpa);
             mp_clear(&tmpb);
 
-            if (ret != 0) return ret;
+            if (ret != 0) {
+                goto done;
+            }
 
         #endif   /* RSA_LOW_MEM */
     }
diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index 81372ab8c..258e31e7d 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -50,6 +50,10 @@
 #include <wolfssl/wolfcrypt/tfm.h>
 #include <wolfcrypt/src/asm.c>  /* will define asm MACROS or C ones */
 
+#ifdef WOLFSSL_DEBUG_MATH
+    #include <stdio.h>
+#endif
+
 
 /* math settings check */
 word32 CheckRunTimeSettings(void)
@@ -118,6 +122,8 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
   }
 
   c->used = x;
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -179,6 +185,8 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
      c->dp[x]  = (fp_digit)t;
      t         = (t >> DIGIT_BIT)&1;
    }
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -188,7 +196,9 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 /* c = a * b */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
-    int   y, yy;
+    int   y, yy, oldused;
+
+    oldused = C->used;
 
     y  = MAX(A->used, B->used);
     yy = MIN(A->used, B->used);
@@ -196,7 +206,7 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
     /* call generic if we're out of range */
     if (y + yy > FP_SIZE) {
        fp_mul_comba(A, B, C);
-       return ;
+       goto clean;
     }
 
     /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
@@ -205,98 +215,104 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
        if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
     */
 
-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_mul_comba3(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
         if (y == 4) {
            fp_mul_comba4(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_mul_comba6(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
         if (y == 7) {
            fp_mul_comba7(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
         if (y == 8) {
            fp_mul_comba8(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
         if (y == 9) {
            fp_mul_comba9(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_mul_comba12(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_mul_comba17(A,B,C);
-           return;
+           goto clean;
         }
 #endif
 
-#ifdef TFM_SMALL_SET
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
         if (y <= 16) {
            fp_mul_comba_small(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL20)
+#if defined(TFM_MUL20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_mul_comba20(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL24)
+#if defined(TFM_MUL24) && FP_SIZE >= 48
         if (yy >= 16 && y <= 24) {
            fp_mul_comba24(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL28)
+#if defined(TFM_MUL28) && FP_SIZE >= 56
         if (yy >= 20 && y <= 28) {
            fp_mul_comba28(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL32)
+#if defined(TFM_MUL32) && FP_SIZE >= 64
         if (yy >= 24 && y <= 32) {
            fp_mul_comba32(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL48)
+#if defined(TFM_MUL48) && FP_SIZE >= 96
         if (yy >= 40 && y <= 48) {
-           fp_mul_comba48(A,B,C);
-           return;
+          fp_mul_comba48(A,B,C);
+          goto clean;
         }
 #endif
-#if defined(TFM_MUL64)
+#if defined(TFM_MUL64) && FP_SIZE >= 128
         if (yy >= 56 && y <= 64) {
            fp_mul_comba64(A,B,C);
-           return;
+           goto clean;
         }
 #endif
         fp_mul_comba(A,B,C);
+
+clean:
+    /* zero any excess digits on the destination that we didn't write to */
+    for (y = C->used; y < oldused; y++) {
+        C->dp[y] = 0;
+    }
 }
 
 void fp_mul_2(fp_int * a, fp_int * b)
@@ -340,9 +356,7 @@ void fp_mul_2(fp_int * a, fp_int * b)
       ++(b->used);
     }
 
-    /* now zero any excess digits on the destination
-     * that we didn't write to
-     */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -370,6 +384,8 @@ void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
       c->dp[c->used++] = (fp_digit) w;
       ++x;
    }
+
+   /* zero any excess digits on the destination that we didn't write to */
    for (; x < oldused; x++) {
       c->dp[x] = 0;
    }
@@ -627,9 +643,7 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
   if (d != NULL) {
     fp_div_2d (&x, norm, &x, NULL);
 
-/* the following is a kludge, essentially we were seeing the right remainder but
-   with excess digits that should have been zero
- */
+    /* zero any excess digits on the destination that we didn't write to */
     for (i = b->used; i < x.used; i++) {
         x.dp[i] = 0;
     }
@@ -669,7 +683,7 @@ void fp_div_2(fp_int * a, fp_int * b)
       r = rr;
     }
 
-    /* zero excess digits */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -1049,9 +1063,14 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  */
 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
-  fp_int   M[64], res;
+  fp_int   res;
   fp_digit buf, mp;
   int      err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+#ifdef WOLFSSL_SMALL_STACK
+  fp_int  *M;
+#else
+  fp_int   M[64];
+#endif
 
   /* find window size */
   x = fp_count_bits (X);
@@ -1067,15 +1086,23 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
     winsize = 6;
   }
 
-  /* init M array */
-  for(x = 0; x < (int)(sizeof(M)/sizeof(fp_int)); x++)
-    fp_init(&M[x]);
-
   /* now setup montgomery  */
   if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
      return err;
   }
 
+#ifdef WOLFSSL_SMALL_STACK
+  /* only allocate space for what's needed */
+  M = (fp_int*)XMALLOC(sizeof(fp_int)*(1 << winsize), NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (M == NULL) {
+     return FP_MEM;
+  }
+#endif
+
+  /* init M array */
+  for(x = 0; x < (1 << winsize); x++)
+    fp_init(&M[x]);
+
   /* setup result */
   fp_init(&res);
 
@@ -1083,7 +1110,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
    *
    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
-   * The first half of the table is not computed though accept for M[0] and M[1]
+   * The first half of the table is not computed though except for M[0] and M[1]
    */
 
    /* now we need R mod m */
@@ -1202,10 +1229,15 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 
   /* swap res with Y */
   fp_copy (&res, Y);
+
+#ifdef WOLFSSL_SMALL_STACK
+  XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+
   return FP_OKAY;
 }
 
-#endif
+#endif /* TFM_TIMING_RESISTANT */
 
 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
@@ -1267,105 +1299,114 @@ void fp_2expt(fp_int *a, int b)
 /* b = a*a  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
-    int y = A->used;
+    int y, oldused;
+
+    oldused = B->used;
+    y = A->used;
 
     /* call generic if we're out of range */
     if (y + y > FP_SIZE) {
        fp_sqr_comba(A, B);
-       return ;
+       goto clean;
     }
 
-#if defined(TFM_SQR3)
+#if defined(TFM_SQR3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_sqr_comba3(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR4)
+#if defined(TFM_SQR4) && FP_SIZE >= 8
         if (y == 4) {
            fp_sqr_comba4(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR6)
+#if defined(TFM_SQR6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_sqr_comba6(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR7)
+#if defined(TFM_SQR7) && FP_SIZE >= 14
         if (y == 7) {
            fp_sqr_comba7(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR8)
+#if defined(TFM_SQR8) && FP_SIZE >= 16
         if (y == 8) {
            fp_sqr_comba8(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR9)
+#if defined(TFM_SQR9) && FP_SIZE >= 18
         if (y == 9) {
            fp_sqr_comba9(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR12)
+#if defined(TFM_SQR12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_sqr_comba12(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR17)
+#if defined(TFM_SQR17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_sqr_comba17(A,B);
-           return;
+           goto clean;
         }
 #endif
 #if defined(TFM_SMALL_SET)
         if (y <= 16) {
            fp_sqr_comba_small(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR20)
+#if defined(TFM_SQR20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_sqr_comba20(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR24)
+#if defined(TFM_SQR24) && FP_SIZE >= 48
         if (y <= 24) {
            fp_sqr_comba24(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR28)
+#if defined(TFM_SQR28) && FP_SIZE >= 56
         if (y <= 28) {
            fp_sqr_comba28(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR32)
+#if defined(TFM_SQR32) && FP_SIZE >= 64
         if (y <= 32) {
            fp_sqr_comba32(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR48)
+#if defined(TFM_SQR48) && FP_SIZE >= 96
         if (y <= 48) {
            fp_sqr_comba48(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR64)
+#if defined(TFM_SQR64) && FP_SIZE >= 128
         if (y <= 64) {
            fp_sqr_comba64(A,B);
-           return;
+           goto clean;
         }
 #endif
        fp_sqr_comba(A, B);
+
+clean:
+  /* zero any excess digits on the destination that we didn't write to */
+  for (y = B->used; y < oldused; y++) {
+    B->dp[y] = 0;
+  }
 }
 
 /* generic comba squarer */
@@ -1513,7 +1554,7 @@ int fp_cmp_mag(fp_int *a, fp_int *b)
    return FP_EQ;
 }
 
-/* setups the montgomery reduction */
+/* sets up the montgomery reduction */
 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
 {
   fp_digit x, b;
@@ -1612,7 +1653,7 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
 
 
    /* now zero the buff */
-   XMEMSET(c, 0, sizeof c);
+   XMEMSET(c, 0, sizeof(c));
    pa = m->used;
 
    /* copy the input */
@@ -1652,7 +1693,8 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
@@ -1691,7 +1733,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 
 
    /* now zero the buff */
-   XMEMSET(c, 0, sizeof c);
+   XMEMSET(c, 0, sizeof(c));
    pa = m->used;
 
    /* copy the input */
@@ -1733,7 +1775,8 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
@@ -1829,7 +1872,7 @@ void fp_set(fp_int *a, fp_digit b)
    a->used  = a->dp[0] ? 1 : 0;
 }
 
-/* chek if a bit is set */
+/* check if a bit is set */
 int fp_is_bit_set (fp_int *a, fp_digit b)
 {
     fp_digit i;
@@ -2177,13 +2220,20 @@ int mp_div_2d(fp_int* a, int b, fp_int* c, fp_int* d)
 }
 
 #ifdef ALT_ECC_SIZE
-void fp_copy(fp_int *a, fp_int* b)
+void fp_copy(fp_int *a, fp_int *b)
 {
     if (a != b && b->size >= a->used) {
+        int x, oldused;
+        oldused = b->used;
         b->used = a->used;
         b->sign = a->sign;
 
         XMEMCPY(b->dp, a->dp, a->used * sizeof(fp_digit));
+
+        /* zero any excess digits on the destination that we didn't write to */
+        for (x = b->used; x < oldused; x++) {
+            b->dp[x] = 0;
+        }
     }
 }
 
@@ -2196,49 +2246,39 @@ void fp_init_copy(fp_int *a, fp_int* b)
 }
 #endif
 
-/* fast math conversion */
+/* fast math wrappers */
 int mp_copy(fp_int* a, fp_int* b)
 {
     fp_copy(a, b);
     return MP_OKAY;
 }
 
-
-/* fast math conversion */
 int mp_isodd(mp_int* a)
 {
     return fp_isodd(a);
 }
 
-
-/* fast math conversion */
 int mp_iszero(mp_int* a)
 {
     return fp_iszero(a);
 }
 
 
-/* fast math conversion */
 int mp_count_bits (mp_int* a)
 {
     return fp_count_bits(a);
 }
 
-
 int mp_leading_bit (mp_int* a)
 {
     return fp_leading_bit(a);
 }
 
-
-/* fast math conversion */
 void mp_rshb (mp_int* a, int x)
 {
     fp_rshb(a, x);
 }
 
-
-/* fast math wrappers */
 int mp_set_int(mp_int *a, mp_digit b)
 {
     fp_set(a, b);
@@ -2282,7 +2322,8 @@ int mp_montgomery_calc_normalization(mp_int *a, mp_int *b)
 #endif /* WOLFSSL_KEYGEN || HAVE_ECC */
 
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 static const int lnz[16] = {
    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
@@ -2429,7 +2470,7 @@ int mp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
    return fp_mod_d(a, b, c);
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #ifdef WOLFSSL_KEY_GEN
 
@@ -2872,7 +2913,8 @@ int mp_cnt_lsb(fp_int* a)
 
 #endif /* HAVE_ECC */
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 /* returns size of ASCII representation */
 int mp_radix_size (mp_int *a, int radix, int *size)
@@ -2980,7 +3022,32 @@ int mp_toradix (mp_int *a, char *str, int radix)
     return FP_OKAY;
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#ifdef WOLFSSL_DEBUG_MATH
+/* debug aid: print a's metadata and hex value, plus raw bytes if verbose */
+void mp_dump(const char* desc, mp_int* a, byte verbose)
+{
+  /* 2 hex chars per byte + optional '-' sign + terminating NUL */
+  char buffer[FP_SIZE * sizeof(fp_digit) * 2 + 2];
+  int size = FP_SIZE;
+#ifdef ALT_ECC_SIZE
+  size = a->size;
+#endif
+
+  printf("%s: ptr=%p, used=%d, sign=%d, size=%d, fpd=%d\n",
+    desc, (void*)a, a->used, a->sign, size, (int)sizeof(fp_digit));
+  mp_toradix(a, buffer, 16);
+  printf("  %s\n  ", buffer);
+
+  if (verbose) {
+    int i;
+    for(i=0; i<size * (int)sizeof(fp_digit); i++) {
+      printf("%x ", *(((byte*)a->dp) + i));
+    }
+    printf("\n");
+  }
+}
+#endif /* WOLFSSL_DEBUG_MATH */
+
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #endif /* USE_FAST_MATH */
-
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index 32da38dc7..d5d114c18 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -6560,7 +6560,7 @@ static int ecc_test_key_gen(WC_RNG* rng, int keySize)
 
     ret = wc_ecc_check_key(&userA);
     if (ret != 0)
-        return -1024;
+        return -1023;
 
     derSz = wc_EccKeyToDer(&userA, der, FOURK_BUF);
     if (derSz < 0) {
@@ -6621,8 +6621,8 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
     int testCompressedKey)
 {
 #ifdef BENCH_EMBEDDED
-    byte    sharedA[32];
-    byte    sharedB[32];
+    byte    sharedA[128]; /* Needs to be at least keySize */
+    byte    sharedB[128]; /* Needs to be at least keySize */
 #else
     byte    sharedA[1024];
     byte    sharedB[1024];
@@ -6652,7 +6652,7 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
 
     ret = wc_ecc_check_key(&userA);
     if (ret != 0)
-        return -1024;
+        return -1023;
 
     ret = wc_ecc_make_key(rng, keySize, &userB);
     if (ret != 0)
@@ -6732,6 +6732,7 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
     for (i = 0; i < (int)sizeof(digest); i++) {
         digest[i] = 0;
     }
+    digest[i-1] = 1; /* Set last digit to non-zero value */
 
     x = sizeof(sig);
     ret = wc_ecc_sign_hash(digest, sizeof(digest), sig, &x, rng, &userA);
@@ -6801,12 +6802,14 @@ static int ecc_test_curve(WC_RNG* rng, int keySize)
     ret = ecc_test_curve_size(rng, keySize, ECC_TEST_VERIFY_COUNT,
                                                         testCompressedKey);
     if (ret < 0) {
+        printf("ecc_test_curve_size %d failed!: %d\n", keySize, ret);
         return ret;
     }
 
     #ifdef HAVE_ECC_VECTOR_TEST
         ret = ecc_test_vector(keySize);
         if (ret < 0) {
+            printf("ecc_test_vector %d failed!: %d\n", keySize, ret);
             return ret;
         }
     #endif
@@ -6814,6 +6817,7 @@ static int ecc_test_curve(WC_RNG* rng, int keySize)
     #ifdef WOLFSSL_KEY_GEN
         ret = ecc_test_key_gen(rng, keySize);
         if (ret < 0) {
+            printf("ecc_test_key_gen %d failed!: %d\n", keySize, ret);
             return ret;
         }
     #endif
diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index b952e68b9..e67a49720 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -61,6 +61,27 @@ typedef struct {
 } ecc_set_type;
 
 
+/* Determine max ECC bits based on enabled curves. Branches are tested from
+ * the largest curve to the smallest so that the first match is the maximum;
+ * ECC256 is on by default (unless NO_ECC256), so it must precede ECC224. */
+#if defined(HAVE_ECC521) || defined(HAVE_ALL_CURVES)
+    #define MAX_ECC_BITS    521
+#elif defined(HAVE_ECC384)
+    #define MAX_ECC_BITS    384
+#elif !defined(NO_ECC256)
+    #define MAX_ECC_BITS    256
+#elif defined(HAVE_ECC224)
+    #define MAX_ECC_BITS    224
+#elif defined(HAVE_ECC192)
+    #define MAX_ECC_BITS    192
+#elif defined(HAVE_ECC160)
+    #define MAX_ECC_BITS    160
+#elif defined(HAVE_ECC128)
+    #define MAX_ECC_BITS    128
+#elif defined(HAVE_ECC112)
+    #define MAX_ECC_BITS    112
+#endif
+
 #ifdef ALT_ECC_SIZE
 
 /* Note on ALT_ECC_SIZE:
@@ -90,14 +111,26 @@ typedef struct {
     #error USE_FAST_MATH must be defined to use ALT_ECC_SIZE
 #endif
 
+/* determine max bits required for ECC math */
 #ifndef FP_MAX_BITS_ECC
-    #define FP_MAX_BITS_ECC           528
+    /* check alignment */
+    #if ((MAX_ECC_BITS * 2) % DIGIT_BIT) == 0
+        /* max bits is double */
+        #define FP_MAX_BITS_ECC     (MAX_ECC_BITS * 2)
+    #else
+        /* max bits is doubled, plus one digit of fudge */
+        #define FP_MAX_BITS_ECC     ((MAX_ECC_BITS * 2) + DIGIT_BIT)
+    #endif
+#else
+    /* verify alignment */
+    #if FP_MAX_BITS_ECC % CHAR_BIT
+       #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
+    #endif
 #endif
-#define FP_MAX_SIZE_ECC           (FP_MAX_BITS_ECC+(8*DIGIT_BIT))
-#if FP_MAX_BITS_ECC % CHAR_BIT
-   #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
-#endif
-#define FP_SIZE_ECC    (FP_MAX_SIZE_ECC/DIGIT_BIT)
+
+/* determine buffer size */
+#define FP_SIZE_ECC    (FP_MAX_BITS_ECC/DIGIT_BIT)
+
 
 /* This needs to match the size of the fp_int struct, except the
  * fp_digit array will be shorter. */
diff --git a/wolfssl/wolfcrypt/integer.h b/wolfssl/wolfcrypt/integer.h
index a0ca3c15e..2b38601cb 100644
--- a/wolfssl/wolfcrypt/integer.h
+++ b/wolfssl/wolfcrypt/integer.h
@@ -310,6 +310,12 @@ int mp_init_multi(mp_int* a, mp_int* b, mp_int* c, mp_int* d, mp_int* e,
 int mp_toradix (mp_int *a, char *str, int radix);
 int mp_radix_size (mp_int * a, int radix, int *size);
 
+#ifdef WOLFSSL_DEBUG_MATH
+    void mp_dump(const char* desc, mp_int* a, byte verbose);
+#else
+    #define mp_dump(desc, a, verbose)
+#endif
+
 #if defined(HAVE_ECC) || defined(WOLFSSL_KEY_GEN)
     int mp_sqrmod(mp_int* a, mp_int* b, mp_int* c);
 #endif
diff --git a/wolfssl/wolfcrypt/tfm.h b/wolfssl/wolfcrypt/tfm.h
index c0e05e4ae..f86a7e52f 100644
--- a/wolfssl/wolfcrypt/tfm.h
+++ b/wolfssl/wolfcrypt/tfm.h
@@ -211,6 +211,7 @@
 #if defined(FP_64BIT)
    /* for GCC only on supported platforms */
    typedef unsigned long long fp_digit;   /* 64bit, 128 uses mode(TI) below */
+   #define SIZEOF_FP_DIGIT 8
    typedef unsigned long      fp_word __attribute__ ((mode(TI)));
 #else
    #if defined(_MSC_VER) || defined(__BORLANDC__)
@@ -221,12 +222,14 @@
 
    #ifndef NO_64BIT
       typedef unsigned int       fp_digit;
+      #define SIZEOF_FP_DIGIT 4
       typedef ulong64            fp_word;
       #define FP_32BIT
    #else
       /* some procs like coldfire prefer not to place multiply into 64bit type
          even though it exists */
       typedef unsigned short     fp_digit;
+      #define SIZEOF_FP_DIGIT 2
       typedef unsigned int       fp_word;
    #endif
 #endif
@@ -234,7 +237,7 @@
 #endif /* WOLFSSL_BIGINT_TYPES */
 
 /* # of digits this is */
-#define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
+#define DIGIT_BIT   ((CHAR_BIT) * SIZEOF_FP_DIGIT)
 
 /* Max size of any number in bits.  Basically the largest size you will be
  * multiplying should be half [or smaller] of FP_MAX_SIZE-four_digit
@@ -548,103 +551,38 @@ void fp_reverse(unsigned char *s, int len);
 
 void fp_mul_comba(fp_int *a, fp_int *b, fp_int *c);
 
-#ifdef TFM_SMALL_SET
 void fp_mul_comba_small(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL3
 void fp_mul_comba3(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL4
 void fp_mul_comba4(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL6
 void fp_mul_comba6(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL7
 void fp_mul_comba7(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL8
 void fp_mul_comba8(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL9
 void fp_mul_comba9(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL12
 void fp_mul_comba12(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL17
 void fp_mul_comba17(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL20
 void fp_mul_comba20(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL24
 void fp_mul_comba24(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL28
 void fp_mul_comba28(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
 void fp_sqr_comba(fp_int *a, fp_int *b);
-
-#ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR3
 void fp_sqr_comba3(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR4
 void fp_sqr_comba4(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR6
 void fp_sqr_comba6(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR7
 void fp_sqr_comba7(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR8
 void fp_sqr_comba8(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR9
 void fp_sqr_comba9(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR12
 void fp_sqr_comba12(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR17
 void fp_sqr_comba17(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR20
 void fp_sqr_comba20(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR24
 void fp_sqr_comba24(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR28
 void fp_sqr_comba28(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *a, fp_int *b);
-#endif
+
 /*extern const char *fp_s_rmap;*/
 
 
@@ -707,6 +645,12 @@ void mp_rshb(mp_int *a, int x);
 int mp_toradix (mp_int *a, char *str, int radix);
 int mp_radix_size (mp_int * a, int radix, int *size);
 
+#ifdef WOLFSSL_DEBUG_MATH
+    void mp_dump(const char* desc, mp_int* a, byte verbose);
+#else
+    #define mp_dump(desc, a, verbose)
+#endif
+
 #ifdef HAVE_ECC
     int mp_read_radix(mp_int* a, const char* str, int radix);
     void mp_set(fp_int *a, fp_digit b);