diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index be71639f6..dd1aa14b4 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -118,6 +118,8 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
   }
 
   c->used = x;
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -179,6 +181,8 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
      c->dp[x]  = (fp_digit)t;
      t         = (t >> DIGIT_BIT)&1;
    }
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -188,7 +192,9 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 /* c = a * b */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
-    int   y, yy;
+    int   y, yy, oldused;
+
+    oldused = C->used;
 
     y  = MAX(A->used, B->used);
     yy = MIN(A->used, B->used);
@@ -196,7 +202,7 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
     /* call generic if we're out of range */
     if (y + yy > FP_SIZE) {
        fp_mul_comba(A, B, C);
-       return ;
+       goto clean;
     }
 
     /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
@@ -205,98 +211,104 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
        if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
     */
 
-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_mul_comba3(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
         if (y == 4) {
            fp_mul_comba4(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_mul_comba6(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
         if (y == 7) {
            fp_mul_comba7(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
         if (y == 8) {
            fp_mul_comba8(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
         if (y == 9) {
            fp_mul_comba9(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_mul_comba12(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_mul_comba17(A,B,C);
-           return;
+           goto clean;
         }
 #endif
 
-#ifdef TFM_SMALL_SET
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
         if (y <= 16) {
            fp_mul_comba_small(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL20)
+#if defined(TFM_MUL20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_mul_comba20(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL24)
+#if defined(TFM_MUL24) && FP_SIZE >= 48
         if (yy >= 16 && y <= 24) {
            fp_mul_comba24(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL28)
+#if defined(TFM_MUL28) && FP_SIZE >= 56
         if (yy >= 20 && y <= 28) {
            fp_mul_comba28(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL32)
+#if defined(TFM_MUL32) && FP_SIZE >= 64
         if (yy >= 24 && y <= 32) {
            fp_mul_comba32(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL48)
+#if defined(TFM_MUL48) && FP_SIZE >= 96
         if (yy >= 40 && y <= 48) {
            fp_mul_comba48(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL64)
+#if defined(TFM_MUL64) && FP_SIZE >= 128
         if (yy >= 56 && y <= 64) {
            fp_mul_comba64(A,B,C);
-           return;
+           goto clean;
         }
 #endif
         fp_mul_comba(A,B,C);
+
+clean:
+    /* zero any excess digits on the destination that we didn't write to */
+    for (y = C->used; y < oldused; y++) {
+        C->dp[y] = 0;
+    }
 }
 
 void fp_mul_2(fp_int * a, fp_int * b)
@@ -340,9 +352,7 @@ void fp_mul_2(fp_int * a, fp_int * b)
       ++(b->used);
     }
 
-    /* now zero any excess digits on the destination
-     * that we didn't write to
-     */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -370,6 +380,8 @@ void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
       c->dp[c->used++] = (fp_digit) w;
       ++x;
    }
+
+   /* zero any excess digits on the destination that we didn't write to */
    for (; x < oldused; x++) {
       c->dp[x] = 0;
    }
@@ -627,9 +639,7 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
   if (d != NULL) {
     fp_div_2d (&x, norm, &x, NULL);
 
-/* the following is a kludge, essentially we were seeing the right remainder but
-   with excess digits that should have been zero
- */
+    /* clear excess digits of the remainder that should have been zero */
     for (i = b->used; i < x.used; i++) {
         x.dp[i] = 0;
     }
@@ -669,7 +679,7 @@ void fp_div_2(fp_int * a, fp_int * b)
       r = rr;
     }
 
-    /* zero excess digits */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -1267,105 +1277,114 @@ void fp_2expt(fp_int *a, int b)
 /* b = a*a  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
-    int y = A->used;
+    int y, oldused;
+
+    oldused = B->used;
+    y = A->used;
 
     /* call generic if we're out of range */
     if (y + y > FP_SIZE) {
        fp_sqr_comba(A, B);
-       return ;
+       goto clean;
     }
 
-#if defined(TFM_SQR3)
+#if defined(TFM_SQR3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_sqr_comba3(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR4)
+#if defined(TFM_SQR4) && FP_SIZE >= 8
         if (y == 4) {
            fp_sqr_comba4(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR6)
+#if defined(TFM_SQR6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_sqr_comba6(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR7)
+#if defined(TFM_SQR7) && FP_SIZE >= 14
         if (y == 7) {
            fp_sqr_comba7(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR8)
+#if defined(TFM_SQR8) && FP_SIZE >= 16
         if (y == 8) {
            fp_sqr_comba8(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR9)
+#if defined(TFM_SQR9) && FP_SIZE >= 18
         if (y == 9) {
            fp_sqr_comba9(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR12)
+#if defined(TFM_SQR12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_sqr_comba12(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR17)
+#if defined(TFM_SQR17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_sqr_comba17(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SMALL_SET)
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
         if (y <= 16) {
            fp_sqr_comba_small(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR20)
+#if defined(TFM_SQR20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_sqr_comba20(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR24)
+#if defined(TFM_SQR24) && FP_SIZE >= 48
         if (y <= 24) {
            fp_sqr_comba24(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR28)
+#if defined(TFM_SQR28) && FP_SIZE >= 56
         if (y <= 28) {
            fp_sqr_comba28(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR32)
+#if defined(TFM_SQR32) && FP_SIZE >= 64
         if (y <= 32) {
            fp_sqr_comba32(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR48)
+#if defined(TFM_SQR48) && FP_SIZE >= 96
         if (y <= 48) {
            fp_sqr_comba48(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR64)
+#if defined(TFM_SQR64) && FP_SIZE >= 128
         if (y <= 64) {
            fp_sqr_comba64(A,B);
-           return;
+           goto clean;
         }
 #endif
        fp_sqr_comba(A, B);
+
+clean:
+    /* zero any excess digits on the destination that we didn't write to */
+    for (y = B->used; y < oldused; y++) {
+        B->dp[y] = 0;
+    }
 }
 
 /* generic comba squarer */
@@ -1652,7 +1671,8 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
@@ -1733,7 +1753,8 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
diff --git a/wolfssl/wolfcrypt/tfm.h b/wolfssl/wolfcrypt/tfm.h
index c0e05e4ae..ce633b43d 100644
--- a/wolfssl/wolfcrypt/tfm.h
+++ b/wolfssl/wolfcrypt/tfm.h
@@ -211,6 +211,7 @@
 #if defined(FP_64BIT)
    /* for GCC only on supported platforms */
    typedef unsigned long long fp_digit;   /* 64bit, 128 uses mode(TI) below */
+   #define SIZEOF_FP_DIGIT 8
    typedef unsigned long      fp_word __attribute__ ((mode(TI)));
 #else
    #if defined(_MSC_VER) || defined(__BORLANDC__)
@@ -221,12 +222,14 @@
 
    #ifndef NO_64BIT
       typedef unsigned int       fp_digit;
+      #define SIZEOF_FP_DIGIT 4
       typedef ulong64            fp_word;
       #define FP_32BIT
    #else
       /* some procs like coldfire prefer not to place multiply into 64bit type
          even though it exists */
       typedef unsigned short     fp_digit;
+      #define SIZEOF_FP_DIGIT 2
       typedef unsigned int       fp_word;
    #endif
 #endif
@@ -234,7 +237,7 @@
 #endif /* WOLFSSL_BIGINT_TYPES */
 
-/* # of digits this is */
+/* # of bits in a single digit */
-#define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
+#define DIGIT_BIT   ((CHAR_BIT) * SIZEOF_FP_DIGIT)
 
 /* Max size of any number in bits.  Basically the largest size you will be
  * multiplying should be half [or smaller] of FP_MAX_SIZE-four_digit
@@ -548,103 +551,38 @@ void fp_reverse(unsigned char *s, int len);
 
 void fp_mul_comba(fp_int *a, fp_int *b, fp_int *c);
 
-#ifdef TFM_SMALL_SET
 void fp_mul_comba_small(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL3
 void fp_mul_comba3(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL4
 void fp_mul_comba4(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL6
 void fp_mul_comba6(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL7
 void fp_mul_comba7(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL8
 void fp_mul_comba8(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL9
 void fp_mul_comba9(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL12
 void fp_mul_comba12(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL17
 void fp_mul_comba17(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL20
 void fp_mul_comba20(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL24
 void fp_mul_comba24(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL28
 void fp_mul_comba28(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
 void fp_sqr_comba(fp_int *a, fp_int *b);
-
-#ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR3
 void fp_sqr_comba3(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR4
 void fp_sqr_comba4(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR6
 void fp_sqr_comba6(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR7
 void fp_sqr_comba7(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR8
 void fp_sqr_comba8(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR9
 void fp_sqr_comba9(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR12
 void fp_sqr_comba12(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR17
 void fp_sqr_comba17(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR20
 void fp_sqr_comba20(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR24
 void fp_sqr_comba24(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR28
 void fp_sqr_comba28(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *a, fp_int *b);
-#endif
+
 /*extern const char *fp_s_rmap;*/