From 9f0fa7500f553e19ba904129aeef70ecb8499103 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:14:30 -0700
Subject: [PATCH 01/16] Added configure "--disable-eccshamir" option.

---
 configure.ac | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index a46fcc17b..64c4231c8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -731,6 +731,14 @@ then
 ECC_DEFAULT=yes
 fi
 
+# ECC Shamir (own option name; -DECC_SHAMIR is only emitted when ECC is enabled)
+AC_ARG_ENABLE([eccshamir],
+    [AS_HELP_STRING([--enable-eccshamir],[Enable ECC Shamir (default: enabled)])],
+    [ ENABLED_ECC_SHAMIR=$enableval ],
+    [ ENABLED_ECC_SHAMIR=yes ]
+    )
+
+
 # ECC
 AC_ARG_ENABLE([ecc],
     [AS_HELP_STRING([--enable-ecc],[Enable ECC (default: enabled on x86_64)])],
@@ -751,7 +759,11 @@ fi
 
 if test "$ENABLED_ECC" = "yes"
 then
-    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
+    if test "$ENABLED_ECC_SHAMIR" = "yes"
+    then
+        AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+    fi
 fi
 
 AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
@@ -1961,8 +1973,13 @@ then
     if test "x$ENABLED_ECC" = "xno"
     then
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
     if test "x$ENABLED_OPENSSLEXTRA" = "xno"
     then
@@ -2029,8 +2046,13 @@ then
     if test "x$ENABLED_ECC" = "xno"
     then
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
     if test "x$ENABLED_PKCALLBACKS" = "xno"
     then
@@ -2122,8 +2144,13 @@ then
     then
         ENABLED_OPENSSLEXTRA="yes"
         ENABLED_ECC="yes"
-        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
+        AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
         AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
+        
+        if test "$ENABLED_ECC_SHAMIR" = "yes"
+        then
+            AM_CFLAGS="$AM_CFLAGS -DECC_SHAMIR"
+        fi
     fi
 
     AM_CFLAGS="$AM_CFLAGS -DHAVE_STUNNEL -DWOLFSSL_ALWAYS_VERIFY_CB"

From 9001036e0926dfa2d4352b2aa3330901e6839495 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:14:59 -0700
Subject: [PATCH 02/16] Fixes memory leak in the wc_RsaFunction if failure
 happens when using normal math (not fast math) and RSA_LOW_MEM is not
 defined.

---
 wolfcrypt/src/rsa.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wolfcrypt/src/rsa.c b/wolfcrypt/src/rsa.c
index 690a7c804..7dd775809 100644
--- a/wolfcrypt/src/rsa.c
+++ b/wolfcrypt/src/rsa.c
@@ -845,7 +845,9 @@ static int wc_RsaFunction(const byte* in, word32 inLen, byte* out,
             mp_clear(&tmpa);
             mp_clear(&tmpb);
 
-            if (ret != 0) return ret;
+            if (ret != 0) {
+                goto done;
+            }
 
         #endif   /* RSA_LOW_MEM */
     }

From 7c3fbd76440a9f8e258f94ecd76c62d086df3c57 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:15:38 -0700
Subject: [PATCH 03/16] Fix for fp_copy() when used with ALT_ECC_SIZE so any
 excess digits on the destination that we didn't write to are set to zero.

---
 wolfcrypt/src/tfm.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index 81372ab8c..be71639f6 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -2177,13 +2177,20 @@ int mp_div_2d(fp_int* a, int b, fp_int* c, fp_int* d)
 }
 
 #ifdef ALT_ECC_SIZE
-void fp_copy(fp_int *a, fp_int* b)
+void fp_copy(fp_int *a, fp_int *b)
 {
     if (a != b && b->size >= a->used) {
+        int x, oldused;
+        oldused = b->used;
         b->used = a->used;
         b->sign = a->sign;
 
         XMEMCPY(b->dp, a->dp, a->used * sizeof(fp_digit));
+
+        /* zero any excess digits on the destination that we didn't write to */
+        for (x = b->used; x < oldused; x++) {
+            b->dp[x] = 0;
+        }
     }
 }
 

From a5d27853fa8d7cd62ba1d26eeaa6219d99d68b5a Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:19:24 -0700
Subject: [PATCH 04/16] Fixes to fp_mul and fp_div to clear any excess digits
 on the destination. Added compile-time check to confirm FP_SIZE is compatible
 with TFM_ acceleration defines enabled. Updated comments in other places
 where excess digits are cleared.

---
 wolfcrypt/src/tfm.c     | 167 ++++++++++++++++++++++------------------
 wolfssl/wolfcrypt/tfm.h |  72 ++---------------
 2 files changed, 99 insertions(+), 140 deletions(-)

diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index be71639f6..dd1aa14b4 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -118,6 +118,8 @@ void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
   }
 
   c->used = x;
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -179,6 +181,8 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
      c->dp[x]  = (fp_digit)t;
      t         = (t >> DIGIT_BIT)&1;
    }
+
+  /* zero any excess digits on the destination that we didn't write to */
   for (; x < oldused; x++) {
      c->dp[x] = 0;
   }
@@ -188,7 +192,9 @@ void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 /* c = a * b */
 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 {
-    int   y, yy;
+    int   y, yy, oldused;
+
+    oldused = C->used;
 
     y  = MAX(A->used, B->used);
     yy = MIN(A->used, B->used);
@@ -196,7 +202,7 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
     /* call generic if we're out of range */
     if (y + yy > FP_SIZE) {
        fp_mul_comba(A, B, C);
-       return ;
+       goto clean;
     }
 
     /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
@@ -205,98 +211,104 @@ void fp_mul(fp_int *A, fp_int *B, fp_int *C)
        if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
     */
 
-#ifdef TFM_MUL3
+#if defined(TFM_MUL3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_mul_comba3(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL4
+#if defined(TFM_MUL4) && FP_SIZE >= 8
         if (y == 4) {
            fp_mul_comba4(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL6
+#if defined(TFM_MUL6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_mul_comba6(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL7
+#if defined(TFM_MUL7) && FP_SIZE >= 14
         if (y == 7) {
            fp_mul_comba7(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL8
+#if defined(TFM_MUL8) && FP_SIZE >= 16
         if (y == 8) {
            fp_mul_comba8(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL9
+#if defined(TFM_MUL9) && FP_SIZE >= 18
         if (y == 9) {
            fp_mul_comba9(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL12
+#if defined(TFM_MUL12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_mul_comba12(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#ifdef TFM_MUL17
+#if defined(TFM_MUL17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_mul_comba17(A,B,C);
-           return;
+           goto clean;
         }
 #endif
 
-#ifdef TFM_SMALL_SET
+#if defined(TFM_SMALL_SET) && FP_SIZE >= 32
         if (y <= 16) {
            fp_mul_comba_small(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL20)
+#if defined(TFM_MUL20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_mul_comba20(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL24)
+#if defined(TFM_MUL24) && FP_SIZE >= 48
         if (yy >= 16 && y <= 24) {
            fp_mul_comba24(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL28)
+#if defined(TFM_MUL28) && FP_SIZE >= 56
         if (yy >= 20 && y <= 28) {
            fp_mul_comba28(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL32)
+#if defined(TFM_MUL32) && FP_SIZE >= 64
         if (yy >= 24 && y <= 32) {
            fp_mul_comba32(A,B,C);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_MUL48)
+#if defined(TFM_MUL48) && FP_SIZE >= 96
         if (yy >= 40 && y <= 48) {
-           fp_mul_comba48(A,B,C);
-           return;
+          fp_mul_comba48(A,B,C);
+          goto clean;
         }
 #endif
-#if defined(TFM_MUL64)
+#if defined(TFM_MUL64) && FP_SIZE >= 128
         if (yy >= 56 && y <= 64) {
            fp_mul_comba64(A,B,C);
-           return;
+           goto clean;
         }
 #endif
         fp_mul_comba(A,B,C);
+
+clean:
+    /* zero any excess digits on the destination that we didn't write to */
+    for (y = C->used; y < oldused; y++) {
+        C->dp[y] = 0;
+    }
 }
 
 void fp_mul_2(fp_int * a, fp_int * b)
@@ -340,9 +352,7 @@ void fp_mul_2(fp_int * a, fp_int * b)
       ++(b->used);
     }
 
-    /* now zero any excess digits on the destination
-     * that we didn't write to
-     */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -370,6 +380,8 @@ void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
       c->dp[c->used++] = (fp_digit) w;
       ++x;
    }
+
+   /* zero any excess digits on the destination that we didn't write to */
    for (; x < oldused; x++) {
       c->dp[x] = 0;
    }
@@ -627,9 +639,7 @@ int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
   if (d != NULL) {
     fp_div_2d (&x, norm, &x, NULL);
 
-/* the following is a kludge, essentially we were seeing the right remainder but
-   with excess digits that should have been zero
- */
+    /* zero any excess digits on the destination that we didn't write to */
     for (i = b->used; i < x.used; i++) {
         x.dp[i] = 0;
     }
@@ -669,7 +679,7 @@ void fp_div_2(fp_int * a, fp_int * b)
       r = rr;
     }
 
-    /* zero excess digits */
+    /* zero any excess digits on the destination that we didn't write to */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
@@ -1267,105 +1277,114 @@ void fp_2expt(fp_int *a, int b)
 /* b = a*a  */
 void fp_sqr(fp_int *A, fp_int *B)
 {
-    int y = A->used;
+    int y, oldused;
+
+    oldused = B->used;
+    y = A->used;
 
     /* call generic if we're out of range */
     if (y + y > FP_SIZE) {
        fp_sqr_comba(A, B);
-       return ;
+       goto clean;
     }
 
-#if defined(TFM_SQR3)
+#if defined(TFM_SQR3) && FP_SIZE >= 6
         if (y <= 3) {
            fp_sqr_comba3(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR4)
+#if defined(TFM_SQR4) && FP_SIZE >= 8
         if (y == 4) {
            fp_sqr_comba4(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR6)
+#if defined(TFM_SQR6) && FP_SIZE >= 12
         if (y <= 6) {
            fp_sqr_comba6(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR7)
+#if defined(TFM_SQR7) && FP_SIZE >= 14
         if (y == 7) {
            fp_sqr_comba7(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR8)
+#if defined(TFM_SQR8) && FP_SIZE >= 16
         if (y == 8) {
            fp_sqr_comba8(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR9)
+#if defined(TFM_SQR9) && FP_SIZE >= 18
         if (y == 9) {
            fp_sqr_comba9(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR12)
+#if defined(TFM_SQR12) && FP_SIZE >= 24
         if (y <= 12) {
            fp_sqr_comba12(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR17)
+#if defined(TFM_SQR17) && FP_SIZE >= 34
         if (y <= 17) {
            fp_sqr_comba17(A,B);
-           return;
+           goto clean;
         }
 #endif
 #if defined(TFM_SMALL_SET)
         if (y <= 16) {
            fp_sqr_comba_small(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR20)
+#if defined(TFM_SQR20) && FP_SIZE >= 40
         if (y <= 20) {
            fp_sqr_comba20(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR24)
+#if defined(TFM_SQR24) && FP_SIZE >= 48
         if (y <= 24) {
            fp_sqr_comba24(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR28)
+#if defined(TFM_SQR28) && FP_SIZE >= 56
         if (y <= 28) {
            fp_sqr_comba28(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR32)
+#if defined(TFM_SQR32) && FP_SIZE >= 64
         if (y <= 32) {
            fp_sqr_comba32(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR48)
+#if defined(TFM_SQR48) && FP_SIZE >= 96
         if (y <= 48) {
            fp_sqr_comba48(A,B);
-           return;
+           goto clean;
         }
 #endif
-#if defined(TFM_SQR64)
+#if defined(TFM_SQR64) && FP_SIZE >= 128
         if (y <= 64) {
            fp_sqr_comba64(A,B);
-           return;
+           goto clean;
         }
 #endif
        fp_sqr_comba(A, B);
+
+clean:
+  /* zero any excess digits on the destination that we didn't write to */
+  for (y = B->used; y < oldused; y++) {
+    B->dp[y] = 0;
+  }
 }
 
 /* generic comba squarer */
@@ -1652,7 +1671,8 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
@@ -1733,7 +1753,8 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
      *tmpm++ = *_c++;
   }
 
-  for (; x < oldused; x++)   {
+  /* zero any excess digits on the destination that we didn't write to */
+  for (; x < oldused; x++) {
      *tmpm++ = 0;
   }
 
diff --git a/wolfssl/wolfcrypt/tfm.h b/wolfssl/wolfcrypt/tfm.h
index c0e05e4ae..ce633b43d 100644
--- a/wolfssl/wolfcrypt/tfm.h
+++ b/wolfssl/wolfcrypt/tfm.h
@@ -211,6 +211,7 @@
 #if defined(FP_64BIT)
    /* for GCC only on supported platforms */
    typedef unsigned long long fp_digit;   /* 64bit, 128 uses mode(TI) below */
+   #define SIZEOF_FP_DIGIT 8
    typedef unsigned long      fp_word __attribute__ ((mode(TI)));
 #else
    #if defined(_MSC_VER) || defined(__BORLANDC__)
@@ -221,12 +222,14 @@
 
    #ifndef NO_64BIT
       typedef unsigned int       fp_digit;
+      #define SIZEOF_FP_DIGIT 4
       typedef ulong64            fp_word;
       #define FP_32BIT
    #else
       /* some procs like coldfire prefer not to place multiply into 64bit type
          even though it exists */
       typedef unsigned short     fp_digit;
+      #define SIZEOF_FP_DIGIT 2
       typedef unsigned int       fp_word;
    #endif
 #endif
@@ -234,7 +237,7 @@
 #endif /* WOLFSSL_BIGINT_TYPES */
 
 /* # of digits this is */
-#define DIGIT_BIT  (int)((CHAR_BIT) * sizeof(fp_digit))
+#define DIGIT_BIT   ((CHAR_BIT) * SIZEOF_FP_DIGIT)
 
 /* Max size of any number in bits.  Basically the largest size you will be
  * multiplying should be half [or smaller] of FP_MAX_SIZE-four_digit
@@ -548,103 +551,38 @@ void fp_reverse(unsigned char *s, int len);
 
 void fp_mul_comba(fp_int *a, fp_int *b, fp_int *c);
 
-#ifdef TFM_SMALL_SET
 void fp_mul_comba_small(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL3
 void fp_mul_comba3(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL4
 void fp_mul_comba4(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL6
 void fp_mul_comba6(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL7
 void fp_mul_comba7(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL8
 void fp_mul_comba8(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL9
 void fp_mul_comba9(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL12
 void fp_mul_comba12(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL17
 void fp_mul_comba17(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
-#ifdef TFM_MUL20
 void fp_mul_comba20(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL24
 void fp_mul_comba24(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL28
 void fp_mul_comba28(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL32
 void fp_mul_comba32(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL48
 void fp_mul_comba48(fp_int *a, fp_int *b, fp_int *c);
-#endif
-#ifdef TFM_MUL64
 void fp_mul_comba64(fp_int *a, fp_int *b, fp_int *c);
-#endif
-
 void fp_sqr_comba(fp_int *a, fp_int *b);
-
-#ifdef TFM_SMALL_SET
 void fp_sqr_comba_small(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR3
 void fp_sqr_comba3(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR4
 void fp_sqr_comba4(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR6
 void fp_sqr_comba6(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR7
 void fp_sqr_comba7(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR8
 void fp_sqr_comba8(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR9
 void fp_sqr_comba9(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR12
 void fp_sqr_comba12(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR17
 void fp_sqr_comba17(fp_int *a, fp_int *b);
-#endif
-
-#ifdef TFM_SQR20
 void fp_sqr_comba20(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR24
 void fp_sqr_comba24(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR28
 void fp_sqr_comba28(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR32
 void fp_sqr_comba32(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR48
 void fp_sqr_comba48(fp_int *a, fp_int *b);
-#endif
-#ifdef TFM_SQR64
 void fp_sqr_comba64(fp_int *a, fp_int *b);
-#endif
+
 /*extern const char *fp_s_rmap;*/
 
 

From fa5dd0100146222a43d7562fdb2c600f481eaecf Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:20:03 -0700
Subject: [PATCH 05/16] Fixes/improvements to the wolfCrypt ECC tests. Fixed
 bug with sharedA/sharedB being too small when BENCH_EMBEDDED is used and
 curve size over 256 bit. Added error message for ECC test failures, to show
 the curve size used. Fix to wc_ecc_verify_hash test to use digest that is not
 all zeros as that doesn't work correctly for non-Shamir ECC math. Changed
 return code for wc_ecc_check_key so it's unique.

---
 wolfcrypt/test/test.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index 32da38dc7..d5d114c18 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -6560,7 +6560,7 @@ static int ecc_test_key_gen(WC_RNG* rng, int keySize)
 
     ret = wc_ecc_check_key(&userA);
     if (ret != 0)
-        return -1024;
+        return -1023;
 
     derSz = wc_EccKeyToDer(&userA, der, FOURK_BUF);
     if (derSz < 0) {
@@ -6621,8 +6621,8 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
     int testCompressedKey)
 {
 #ifdef BENCH_EMBEDDED
-    byte    sharedA[32];
-    byte    sharedB[32];
+    byte    sharedA[128]; /* Needs to be at least keySize */
+    byte    sharedB[128]; /* Needs to be at least keySize */
 #else
     byte    sharedA[1024];
     byte    sharedB[1024];
@@ -6652,7 +6652,7 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
 
     ret = wc_ecc_check_key(&userA);
     if (ret != 0)
-        return -1024;
+        return -1023;
 
     ret = wc_ecc_make_key(rng, keySize, &userB);
     if (ret != 0)
@@ -6732,6 +6732,7 @@ static int ecc_test_curve_size(WC_RNG* rng, int keySize, int testVerifyCount,
     for (i = 0; i < (int)sizeof(digest); i++) {
         digest[i] = 0;
     }
+    digest[i-1] = 1; /* Set last digit to non-zero value */
 
     x = sizeof(sig);
     ret = wc_ecc_sign_hash(digest, sizeof(digest), sig, &x, rng, &userA);
@@ -6801,12 +6802,14 @@ static int ecc_test_curve(WC_RNG* rng, int keySize)
     ret = ecc_test_curve_size(rng, keySize, ECC_TEST_VERIFY_COUNT,
                                                         testCompressedKey);
     if (ret < 0) {
+        printf("ecc_test_curve_size %d failed!: %d\n", keySize, ret);
         return ret;
     }
 
     #ifdef HAVE_ECC_VECTOR_TEST
         ret = ecc_test_vector(keySize);
         if (ret < 0) {
+            printf("ecc_test_vector %d failed!: %d\n", keySize, ret);
             return ret;
         }
     #endif
@@ -6814,6 +6817,7 @@ static int ecc_test_curve(WC_RNG* rng, int keySize)
     #ifdef WOLFSSL_KEY_GEN
         ret = ecc_test_key_gen(rng, keySize);
         if (ret < 0) {
+            printf("ecc_test_key_gen %d failed!: %d\n", keySize, ret);
             return ret;
         }
     #endif

From 0ddbe0e60e19e7752108e59b58ad3c70c404e6dc Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:20:55 -0700
Subject: [PATCH 06/16] Enhancement to RSA math function "_fp_exptmod" (non
 timing resistant version) to support WOLFSSL_SMALL_STACK, which moves the
 allocation of the 64 fp_int's from the stack to the heap.

---
 wolfcrypt/src/tfm.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index dd1aa14b4..c3a72c9e2 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -1059,9 +1059,14 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
  */
 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
-  fp_int   M[64], res;
+  fp_int   res;
   fp_digit buf, mp;
   int      err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+#ifdef WOLFSSL_SMALL_STACK
+  fp_int  *M;
+#else
+  fp_int   M[64];
+#endif
 
   /* find window size */
   x = fp_count_bits (X);
@@ -1077,15 +1082,23 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
     winsize = 6;
   }
 
-  /* init M array */
-  for(x = 0; x < (int)(sizeof(M)/sizeof(fp_int)); x++)
-    fp_init(&M[x]);
-
   /* now setup montgomery  */
   if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
      return err;
   }
 
+#ifdef WOLFSSL_SMALL_STACK
+  /* only allocate space for what's needed */
+  M = (fp_int*)XMALLOC(sizeof(fp_int)*(1 << winsize), NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (M == NULL) {
+     return FP_MEM;
+  }
+#endif
+
+  /* init M array */
+  for(x = 0; x < (1 << winsize); x++)
+    fp_init(&M[x]);
+
   /* setup result */
   fp_init(&res);
 
@@ -1093,7 +1106,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
    *
    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
-   * The first half of the table is not computed though accept for M[0] and M[1]
+   * The first half of the table is not computed though except for M[0] and M[1]
    */
 
    /* now we need R mod m */
@@ -1212,10 +1225,15 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 
   /* swap res with Y */
   fp_copy (&res, Y);
+
+#ifdef WOLFSSL_SMALL_STACK
+  XFREE(M, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+
   return FP_OKAY;
 }
 
-#endif
+#endif /* TFM_TIMING_RESISTANT */
 
 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {

From a4782fcf0105b323c58f53c112d52d90f1773728 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:22:14 -0700
Subject: [PATCH 07/16] Fix in fast math version of ecc_projective_dbl_point to
 use a local for x,y,z since ecc_point fp_int's are reduced size and cause
 math issues with ALT_ECC_SIZE enabled. Added local stack variable cleanups
 for ecc_projective_add_point.

---
 wolfcrypt/src/ecc.c | 112 +++++++++++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 43 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index f3a47793b..1465a8752 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -298,7 +298,14 @@ int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
    fp_sub(modulus, Q->y, &t1);
    if ( (fp_cmp(P->x, Q->x) == FP_EQ) &&
         (get_digit_count(Q->z) && fp_cmp(P->z, Q->z) == FP_EQ) &&
-        (fp_cmp(P->y, Q->y) == FP_EQ || fp_cmp(P->y, &t1) == FP_EQ)) {
+        (fp_cmp(P->y, Q->y) == FP_EQ || fp_cmp(P->y, &t1) == FP_EQ))
+   {
+        fp_clear(&x);
+        fp_clear(&y);
+        fp_clear(&z);
+        fp_clear(&t1);
+        fp_clear(&t2);
+
         return ecc_projective_dbl_point(P, R, modulus, mp);
    }
 
@@ -423,10 +430,18 @@ int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
    }
    fp_div_2(&y, &y);
 
+   /* return result */
    fp_copy(&x, R->x);
    fp_copy(&y, R->y);
    fp_copy(&z, R->z);
 
+   /* clear stack variables */
+   fp_clear(&x);
+   fp_clear(&y);
+   fp_clear(&z);
+   fp_clear(&t1);
+   fp_clear(&t2);
+
    return MP_OKAY;
 }
 
@@ -442,41 +457,40 @@ int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
 int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
                              mp_digit* mp)
 {
-   fp_int   t1, t2;
+   fp_int   x, y, z, t1, t2;
    int      err;
 
    if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
        return ECC_BAD_ARG_E;
 
-   if (P != R) {
-      fp_copy(P->x, R->x);
-      fp_copy(P->y, R->y);
-      fp_copy(P->z, R->z);
-   }
-
-   if ((err = mp_init_multi(&t1, &t2, NULL, NULL, NULL, NULL)) != MP_OKAY) {
+   if ((err = mp_init_multi(&x, &y, &z, &t1, &t2, NULL)) != MP_OKAY) {
       return err;
    }
 
-   /* t1 = Z * Z */
-   fp_sqr(R->z, &t1);
+   /* Use local due to possible insufficient size of alt_ecc_size in ecc_point x,y,z */
+   fp_copy(P->x, &x);
+   fp_copy(P->y, &y);
+   fp_copy(P->z, &z);
+
+   /* T1 = Z * Z */
+   fp_sqr(&z, &t1);
    fp_montgomery_reduce(&t1, modulus, *mp);
    /* Z = Y * Z */
-   fp_mul(R->z, R->y, R->z);
-   fp_montgomery_reduce(R->z, modulus, *mp);
+   fp_mul(&z, &y, &z);
+   fp_montgomery_reduce(&z, modulus, *mp);
    /* Z = 2Z */
-   fp_add(R->z, R->z, R->z);
-   if (fp_cmp(R->z, modulus) != FP_LT) {
-      fp_sub(R->z, modulus, R->z);
+   fp_add(&z, &z, &z);
+   if (fp_cmp(&z, modulus) != FP_LT) {
+      fp_sub(&z, modulus, &z);
    }
 
-   /* &t2 = X - T1 */
-   fp_sub(R->x, &t1, &t2);
+   /* T2 = X - T1 */
+   fp_sub(&x, &t1, &t2);
    if (fp_cmp_d(&t2, 0) == FP_LT) {
       fp_add(&t2, modulus, &t2);
    }
    /* T1 = X + T1 */
-   fp_add(&t1, R->x, &t1);
+   fp_add(&t1, &x, &t1);
    if (fp_cmp(&t1, modulus) != FP_LT) {
       fp_sub(&t1, modulus, &t1);
    }
@@ -495,15 +509,15 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
    }
 
    /* Y = 2Y */
-   fp_add(R->y, R->y, R->y);
-   if (fp_cmp(R->y, modulus) != FP_LT) {
-      fp_sub(R->y, modulus, R->y);
+   fp_add(&y, &y, &y);
+   if (fp_cmp(&y, modulus) != FP_LT) {
+      fp_sub(&y, modulus, &y);
    }
    /* Y = Y * Y */
-   fp_sqr(R->y, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
+   fp_sqr(&y, &y);
+   fp_montgomery_reduce(&y, modulus, *mp);
    /* T2 = Y * Y */
-   fp_sqr(R->y, &t2);
+   fp_sqr(&y, &t2);
    fp_montgomery_reduce(&t2, modulus, *mp);
    /* T2 = T2/2 */
    if (fp_isodd(&t2)) {
@@ -511,37 +525,49 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
    }
    fp_div_2(&t2, &t2);
    /* Y = Y * X */
-   fp_mul(R->y, R->x, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
+   fp_mul(&y, &x, &y);
+   fp_montgomery_reduce(&y, modulus, *mp);
 
    /* X  = T1 * T1 */
-   fp_sqr(&t1, R->x);
-   fp_montgomery_reduce(R->x, modulus, *mp);
+   fp_sqr(&t1, &x);
+   fp_montgomery_reduce(&x, modulus, *mp);
    /* X = X - Y */
-   fp_sub(R->x, R->y, R->x);
-   if (fp_cmp_d(R->x, 0) == FP_LT) {
-      fp_add(R->x, modulus, R->x);
+   fp_sub(&x, &y, &x);
+   if (fp_cmp_d(&x, 0) == FP_LT) {
+      fp_add(&x, modulus, &x);
    }
    /* X = X - Y */
-   fp_sub(R->x, R->y, R->x);
-   if (fp_cmp_d(R->x, 0) == FP_LT) {
-      fp_add(R->x, modulus, R->x);
+   fp_sub(&x, &y, &x);
+   if (fp_cmp_d(&x, 0) == FP_LT) {
+      fp_add(&x, modulus, &x);
    }
 
    /* Y = Y - X */
-   fp_sub(R->y, R->x, R->y);
-   if (fp_cmp_d(R->y, 0) == FP_LT) {
-      fp_add(R->y, modulus, R->y);
+   fp_sub(&y, &x, &y);
+   if (fp_cmp_d(&y, 0) == FP_LT) {
+      fp_add(&y, modulus, &y);
    }
    /* Y = Y * T1 */
-   fp_mul(R->y, &t1, R->y);
-   fp_montgomery_reduce(R->y, modulus, *mp);
+   fp_mul(&y, &t1, &y);
+   fp_montgomery_reduce(&y, modulus, *mp);
    /* Y = Y - T2 */
-   fp_sub(R->y, &t2, R->y);
-   if (fp_cmp_d(R->y, 0) == FP_LT) {
-      fp_add(R->y, modulus, R->y);
+   fp_sub(&y, &t2, &y);
+   if (fp_cmp_d(&y, 0) == FP_LT) {
+      fp_add(&y, modulus, &y);
    }
 
+   /* Return x, y, and z */
+   fp_copy(&x, R->x);
+   fp_copy(&y, R->y);
+   fp_copy(&z, R->z);
+
+   /* Clear used locals */
+   fp_clear(&x);
+   fp_clear(&y);
+   fp_clear(&z);
+   fp_clear(&t1);
+   fp_clear(&t2);
+
    return MP_OKAY;
 }
 

From 5cbc4bdf2967692ac971d52dad50a4bc228c517c Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:23:04 -0700
Subject: [PATCH 08/16] Added new "WOLFSSL_DEBUG_MATH", which enables use of
 "mp_dump" to display information about an mp_int.

---
 wolfcrypt/src/integer.c     | 38 ++++++++++++++++++++++++++++++++--
 wolfcrypt/src/tfm.c         | 41 ++++++++++++++++++++++++++++++++-----
 wolfssl/wolfcrypt/integer.h |  6 ++++++
 wolfssl/wolfcrypt/tfm.h     |  6 ++++++
 4 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/wolfcrypt/src/integer.c b/wolfcrypt/src/integer.c
index 9e9b3d01e..045effb9f 100644
--- a/wolfcrypt/src/integer.c
+++ b/wolfcrypt/src/integer.c
@@ -40,6 +40,10 @@
 
 #include <wolfssl/wolfcrypt/integer.h>
 
+#ifdef WOLFSSL_DEBUG_MATH
+    #include <stdio.h>
+#endif
+
 #ifndef NO_WOLFSSL_SMALL_STACK
     #ifndef WOLFSSL_SMALL_STACK
         #define WOLFSSL_SMALL_STACK
@@ -4628,7 +4632,8 @@ int mp_read_radix (mp_int * a, const char *str, int radix)
 }
 #endif /* HAVE_ECC */
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 /* returns size of ASCII representation */
 int mp_radix_size (mp_int *a, int radix, int *size)
@@ -4739,7 +4744,36 @@ int mp_toradix (mp_int *a, char *str, int radix)
     return MP_OKAY;
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#ifdef WOLFSSL_DEBUG_MATH
+/* Debug: print desc, header fields, hex value and (if verbose) raw digit bytes */
+void mp_dump(const char* desc, mp_int* a, byte verbose)
+{
+  char *buffer;
+  int size = a->alloc;
+
+  /* hex text needs up to 2 chars per allocated byte, plus sign and NUL */
+  buffer = (char*)XMALLOC(size * sizeof(mp_digit) * 2 + 2, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (buffer == NULL) {
+    return;
+  }
+
+  /* %p requires a void pointer argument */
+  printf("%s: ptr=%p, used=%d, sign=%d, size=%d, mpd=%d\n",
+    desc, (void*)a, a->used, a->sign, size, (int)sizeof(mp_digit));
+  mp_toradix(a, buffer, 16);
+  printf("  %s\n  ", buffer);
+
+  if (verbose) {
+    int i;
+    for(i=0; i<a->alloc * (int)sizeof(mp_digit); i++) {
+      printf("%02x ", *(((byte*)a->dp) + i));
+    }
+    printf("\n");
+  }
+  XFREE(buffer, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+}
+#endif /* WOLFSSL_DEBUG_MATH */
+
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #endif /* USE_FAST_MATH */
 
diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index c3a72c9e2..5f8e7df2d 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -50,6 +50,10 @@
 #include <wolfssl/wolfcrypt/tfm.h>
 #include <wolfcrypt/src/asm.c>  /* will define asm MACROS or C ones */
 
+#ifdef WOLFSSL_DEBUG_MATH
+    #include <stdio.h>
+#endif
+
 
 /* math settings check */
 word32 CheckRunTimeSettings(void)
@@ -2328,7 +2332,8 @@ int mp_montgomery_calc_normalization(mp_int *a, mp_int *b)
 #endif /* WOLFSSL_KEYGEN || HAVE_ECC */
 
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 static const int lnz[16] = {
    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
@@ -2475,7 +2480,7 @@ int mp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
    return fp_mod_d(a, b, c);
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #ifdef WOLFSSL_KEY_GEN
 
@@ -2918,7 +2923,8 @@ int mp_cnt_lsb(fp_int* a)
 
 #endif /* HAVE_ECC */
 
-#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
+#if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
+    defined(WOLFSSL_DEBUG_MATH)
 
 /* returns size of ASCII representation */
 int mp_radix_size (mp_int *a, int radix, int *size)
@@ -3026,7 +3032,32 @@ int mp_toradix (mp_int *a, char *str, int radix)
     return FP_OKAY;
 }
 
-#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) */
+#ifdef WOLFSSL_DEBUG_MATH
+/* debug: print desc, a's fields, hex value and (if verbose) raw digit bytes */
+void mp_dump(const char* desc, mp_int* a, byte verbose)
+{
+  /* 2 hex chars per digit byte, plus room for a '-' sign and the NUL */
+  char buffer[FP_SIZE * sizeof(fp_digit) * 2 + 2];
+  int size = FP_SIZE;
+#ifdef ALT_ECC_SIZE
+  size = a->size;
+#endif
+
+  printf("%s: ptr=%p, used=%d, sign=%d, size=%d, fpd=%d\n",
+    desc, (void*)a, a->used, a->sign, size, (int)sizeof(fp_digit));
+  if (mp_toradix(a, buffer, 16) != FP_OKAY)
+    buffer[0] = '\0'; /* conversion failed: print an empty value */
+  printf("  %s\n  ", buffer);
+  if (verbose) {
+    int i;
+    for(i=0; i<size * (int)sizeof(fp_digit); i++) {
+      printf("%x ", *(((byte*)a->dp) + i));
+    }
+    printf("\n");
+  }
+}
+#endif /* WOLFSSL_DEBUG_MATH */
+
+#endif /* defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || defined(WOLFSSL_DEBUG_MATH) */
 
 #endif /* USE_FAST_MATH */
-
diff --git a/wolfssl/wolfcrypt/integer.h b/wolfssl/wolfcrypt/integer.h
index a0ca3c15e..2b38601cb 100644
--- a/wolfssl/wolfcrypt/integer.h
+++ b/wolfssl/wolfcrypt/integer.h
@@ -310,6 +310,12 @@ int mp_init_multi(mp_int* a, mp_int* b, mp_int* c, mp_int* d, mp_int* e,
 int mp_toradix (mp_int *a, char *str, int radix);
 int mp_radix_size (mp_int * a, int radix, int *size);
 
+#ifdef WOLFSSL_DEBUG_MATH /* debug-only helper that prints an mp_int */
+    void mp_dump(const char* desc, mp_int* a, byte verbose);
+#else /* debug math off: mp_dump() calls expand to nothing */
+    #define mp_dump(desc, a, verbose)
+#endif /* WOLFSSL_DEBUG_MATH */
+
 #if defined(HAVE_ECC) || defined(WOLFSSL_KEY_GEN)
     int mp_sqrmod(mp_int* a, mp_int* b, mp_int* c);
 #endif
diff --git a/wolfssl/wolfcrypt/tfm.h b/wolfssl/wolfcrypt/tfm.h
index ce633b43d..f86a7e52f 100644
--- a/wolfssl/wolfcrypt/tfm.h
+++ b/wolfssl/wolfcrypt/tfm.h
@@ -645,6 +645,12 @@ void mp_rshb(mp_int *a, int x);
 int mp_toradix (mp_int *a, char *str, int radix);
 int mp_radix_size (mp_int * a, int radix, int *size);
 
+#ifdef WOLFSSL_DEBUG_MATH /* debug-only helper that prints an mp_int */
+    void mp_dump(const char* desc, mp_int* a, byte verbose);
+#else /* debug math off: mp_dump() calls expand to nothing */
+    #define mp_dump(desc, a, verbose)
+#endif /* WOLFSSL_DEBUG_MATH */
+
 #ifdef HAVE_ECC
     int mp_read_radix(mp_int* a, const char* str, int radix);
     void mp_set(fp_int *a, fp_digit b);

From 1b602d783c4b46c88be364a13afe707339413265 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Wed, 4 May 2016 23:24:33 -0700
Subject: [PATCH 09/16] Fast math correction of "sizeof" to use (). Updates to
 tfm and ecc comments.

---
 wolfcrypt/src/ecc.c |  2 +-
 wolfcrypt/src/tfm.c | 20 +++++---------------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index 1465a8752..2952e5c2d 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -2788,7 +2788,7 @@ int wc_ecc_export_x963_ex(ecc_key* key, byte* out, word32* outLen,
 }
 #endif /* HAVE_ECC_KEY_EXPORT */
 
-/* is ec point on curve described by dp ? */
+/* is ecc point on curve described by dp ? */
 static int ecc_is_point(const ecc_set_type* dp, ecc_point* ecp, mp_int* prime)
 {
    mp_int b, t1, t2;
diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index 5f8e7df2d..258e31e7d 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -1554,7 +1554,7 @@ int fp_cmp_mag(fp_int *a, fp_int *b)
    return FP_EQ;
 }
 
-/* setups the montgomery reduction */
+/* sets up the montgomery reduction */
 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
 {
   fp_digit x, b;
@@ -1653,7 +1653,7 @@ static void fp_montgomery_reduce_mulx(fp_int *a, fp_int *m, fp_digit mp)
 
 
    /* now zero the buff */
-   XMEMSET(c, 0, sizeof c);
+   XMEMSET(c, 0, sizeof(c));
    pa = m->used;
 
    /* copy the input */
@@ -1733,7 +1733,7 @@ void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
 
 
    /* now zero the buff */
-   XMEMSET(c, 0, sizeof c);
+   XMEMSET(c, 0, sizeof(c));
    pa = m->used;
 
    /* copy the input */
@@ -1872,7 +1872,7 @@ void fp_set(fp_int *a, fp_digit b)
    a->used  = a->dp[0] ? 1 : 0;
 }
 
-/* chek if a bit is set */
+/* check if a bit is set */
 int fp_is_bit_set (fp_int *a, fp_digit b)
 {
     fp_digit i;
@@ -2246,49 +2246,39 @@ void fp_init_copy(fp_int *a, fp_int* b)
 }
 #endif
 
-/* fast math conversion */
+/* fast math wrappers */
 int mp_copy(fp_int* a, fp_int* b)
 {
     fp_copy(a, b);
     return MP_OKAY;
 }
 
-
-/* fast math conversion */
 int mp_isodd(mp_int* a)
 {
     return fp_isodd(a);
 }
 
-
-/* fast math conversion */
 int mp_iszero(mp_int* a)
 {
     return fp_iszero(a);
 }
 
 
-/* fast math conversion */
 int mp_count_bits (mp_int* a)
 {
     return fp_count_bits(a);
 }
 
-
 int mp_leading_bit (mp_int* a)
 {
     return fp_leading_bit(a);
 }
 
-
-/* fast math conversion */
 void mp_rshb (mp_int* a, int x)
 {
     fp_rshb(a, x);
 }
 
-
-/* fast math wrappers */
 int mp_set_int(mp_int *a, mp_digit b)
 {
     fp_set(a, b);

From fe58db2a07a23641a8c2c183095c044784b53ffa Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 5 May 2016 12:24:08 -0700
Subject: [PATCH 10/16] Fixed typo with new "eccshamir" configure option.

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 64c4231c8..3f496a7d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -732,7 +732,7 @@ ECC_DEFAULT=yes
 fi
 
 # ECC Shamir
-AC_ARG_ENABLE([ecc],
+AC_ARG_ENABLE([eccshamir],
     [AS_HELP_STRING([--enable-eccshamir],[Enable ECC Shamir (default: enabled on x86_64)])],
     [ ENABLED_ECC_SHAMIR=$enableval ],
     [ ENABLED_ECC_SHAMIR=$ECC_DEFAULT ]

From 880b2e454bbbbb67ddab921d3fd908e124c4bffb Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 5 May 2016 19:00:50 -0700
Subject: [PATCH 11/16] Refactor of the ecc_projective_add_point and
 ecc_projective_dbl_point functions to eliminate duplicate versions. Modified
 new single functions to work with normal, fast and alt_ecc_size math options.
 Careful use of mp_clear to retain original performance.

---
 wolfcrypt/src/ecc.c | 554 ++++++++++++--------------------------------
 1 file changed, 150 insertions(+), 404 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index 2952e5c2d..cf5989344 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -268,311 +268,6 @@ static mp_digit get_digit(mp_int* a, int n)
 }
 
 
-#if defined(USE_FAST_MATH)
-
-/* fast math accelerated version, but not for fp ecc yet */
-
-/**
-   Add two ECC points
-   P        The point to add
-   Q        The point to add
-   R        [out] The destination of the double
-   modulus  The modulus of the field the ECC curve is in
-   mp       The "b" value from montgomery_setup()
-   return   MP_OKAY on success
-*/
-int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
-                             mp_int* modulus, mp_digit* mp)
-{
-   fp_int t1, t2, x, y, z;
-   int    err;
-
-   if (P == NULL || Q == NULL || R == NULL || modulus == NULL || mp == NULL)
-       return ECC_BAD_ARG_E;
-
-   if ((err = mp_init_multi(&t1, &t2, &x, &y, &z, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* should we dbl instead? */
-   fp_sub(modulus, Q->y, &t1);
-   if ( (fp_cmp(P->x, Q->x) == FP_EQ) &&
-        (get_digit_count(Q->z) && fp_cmp(P->z, Q->z) == FP_EQ) &&
-        (fp_cmp(P->y, Q->y) == FP_EQ || fp_cmp(P->y, &t1) == FP_EQ))
-   {
-        fp_clear(&x);
-        fp_clear(&y);
-        fp_clear(&z);
-        fp_clear(&t1);
-        fp_clear(&t2);
-
-        return ecc_projective_dbl_point(P, R, modulus, mp);
-   }
-
-   fp_copy(P->x, &x);
-   fp_copy(P->y, &y);
-   fp_copy(P->z, &z);
-
-   /* if Z is one then these are no-operations */
-   if (get_digit_count(Q->z)) {
-      /* T1 = Z' * Z' */
-      fp_sqr(Q->z, &t1);
-      fp_montgomery_reduce(&t1, modulus, *mp);
-      /* X = X * T1 */
-      fp_mul(&t1, &x, &x);
-      fp_montgomery_reduce(&x, modulus, *mp);
-      /* T1 = Z' * T1 */
-      fp_mul(Q->z, &t1, &t1);
-      fp_montgomery_reduce(&t1, modulus, *mp);
-      /* Y = Y * T1 */
-      fp_mul(&t1, &y, &y);
-      fp_montgomery_reduce(&y, modulus, *mp);
-   }
-
-   /* T1 = Z*Z */
-   fp_sqr(&z, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* T2 = X' * T1 */
-   fp_mul(Q->x, &t1, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = Z * T1 */
-   fp_mul(&z, &t1, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* T1 = Y' * T1 */
-   fp_mul(Q->y, &t1, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-
-   /* Y = Y - T1 */
-   fp_sub(&y, &t1, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-   /* T1 = 2T1 */
-   fp_add(&t1, &t1, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T1 = Y + T1 */
-   fp_add(&t1, &y, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* X = X - T2 */
-   fp_sub(&x, &t2, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-   /* T2 = 2T2 */
-   fp_add(&t2, &t2, &t2);
-   if (fp_cmp(&t2, modulus) != FP_LT) {
-      fp_sub(&t2, modulus, &t2);
-   }
-   /* T2 = X + T2 */
-   fp_add(&t2, &x, &t2);
-   if (fp_cmp(&t2, modulus) != FP_LT) {
-      fp_sub(&t2, modulus, &t2);
-   }
-
-   /* if Z' != 1 */
-   if (get_digit_count(Q->z)) {
-      /* Z = Z * Z' */
-      fp_mul(&z, Q->z, &z);
-      fp_montgomery_reduce(&z, modulus, *mp);
-   }
-
-   /* Z = Z * X */
-   fp_mul(&z, &x, &z);
-   fp_montgomery_reduce(&z, modulus, *mp);
-
-   /* T1 = T1 * X  */
-   fp_mul(&t1, &x, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* X = X * X */
-   fp_sqr(&x, &x);
-   fp_montgomery_reduce(&x, modulus, *mp);
-   /* T2 = T2 * x */
-   fp_mul(&t2, &x, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = T1 * X  */
-   fp_mul(&t1, &x, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-
-   /* X = Y*Y */
-   fp_sqr(&y, &x);
-   fp_montgomery_reduce(&x, modulus, *mp);
-   /* X = X - T2 */
-   fp_sub(&x, &t2, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-
-   /* T2 = T2 - X */
-   fp_sub(&t2, &x, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T2 = T2 - X */
-   fp_sub(&t2, &x, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T2 = T2 * Y */
-   fp_mul(&t2, &y, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* Y = T2 - T1 */
-   fp_sub(&t2, &t1, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-   /* Y = Y/2 */
-   if (fp_isodd(&y)) {
-      fp_add(&y, modulus, &y);
-   }
-   fp_div_2(&y, &y);
-
-   /* return result */
-   fp_copy(&x, R->x);
-   fp_copy(&y, R->y);
-   fp_copy(&z, R->z);
-
-   /* clear stack variables */
-   fp_clear(&x);
-   fp_clear(&y);
-   fp_clear(&z);
-   fp_clear(&t1);
-   fp_clear(&t2);
-
-   return MP_OKAY;
-}
-
-
-/**
-   Double an ECC point
-   P   The point to double
-   R   [out] The destination of the double
-   modulus  The modulus of the field the ECC curve is in
-   mp       The "b" value from montgomery_setup()
-   return   MP_OKAY on success
-*/
-int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
-                             mp_digit* mp)
-{
-   fp_int   x, y, z, t1, t2;
-   int      err;
-
-   if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
-       return ECC_BAD_ARG_E;
-
-   if ((err = mp_init_multi(&x, &y, &z, &t1, &t2, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* Use local due to possible insufficient size of alt_ecc_size in ecc_point x,y,z */
-   fp_copy(P->x, &x);
-   fp_copy(P->y, &y);
-   fp_copy(P->z, &z);
-
-   /* T1 = Z * Z */
-   fp_sqr(&z, &t1);
-   fp_montgomery_reduce(&t1, modulus, *mp);
-   /* Z = Y * Z */
-   fp_mul(&z, &y, &z);
-   fp_montgomery_reduce(&z, modulus, *mp);
-   /* Z = 2Z */
-   fp_add(&z, &z, &z);
-   if (fp_cmp(&z, modulus) != FP_LT) {
-      fp_sub(&z, modulus, &z);
-   }
-
-   /* T2 = X - T1 */
-   fp_sub(&x, &t1, &t2);
-   if (fp_cmp_d(&t2, 0) == FP_LT) {
-      fp_add(&t2, modulus, &t2);
-   }
-   /* T1 = X + T1 */
-   fp_add(&t1, &x, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T2 = T1 * T2 */
-   fp_mul(&t1, &t2, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T1 = 2T2 */
-   fp_add(&t2, &t2, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-   /* T1 = T1 + T2 */
-   fp_add(&t1, &t2, &t1);
-   if (fp_cmp(&t1, modulus) != FP_LT) {
-      fp_sub(&t1, modulus, &t1);
-   }
-
-   /* Y = 2Y */
-   fp_add(&y, &y, &y);
-   if (fp_cmp(&y, modulus) != FP_LT) {
-      fp_sub(&y, modulus, &y);
-   }
-   /* Y = Y * Y */
-   fp_sqr(&y, &y);
-   fp_montgomery_reduce(&y, modulus, *mp);
-   /* T2 = Y * Y */
-   fp_sqr(&y, &t2);
-   fp_montgomery_reduce(&t2, modulus, *mp);
-   /* T2 = T2/2 */
-   if (fp_isodd(&t2)) {
-      fp_add(&t2, modulus, &t2);
-   }
-   fp_div_2(&t2, &t2);
-   /* Y = Y * X */
-   fp_mul(&y, &x, &y);
-   fp_montgomery_reduce(&y, modulus, *mp);
-
-   /* X  = T1 * T1 */
-   fp_sqr(&t1, &x);
-   fp_montgomery_reduce(&x, modulus, *mp);
-   /* X = X - Y */
-   fp_sub(&x, &y, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-   /* X = X - Y */
-   fp_sub(&x, &y, &x);
-   if (fp_cmp_d(&x, 0) == FP_LT) {
-      fp_add(&x, modulus, &x);
-   }
-
-   /* Y = Y - X */
-   fp_sub(&y, &x, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-   /* Y = Y * T1 */
-   fp_mul(&y, &t1, &y);
-   fp_montgomery_reduce(&y, modulus, *mp);
-   /* Y = Y - T2 */
-   fp_sub(&y, &t2, &y);
-   if (fp_cmp_d(&y, 0) == FP_LT) {
-      fp_add(&y, modulus, &y);
-   }
-
-   /* Return x, y, and z */
-   fp_copy(&x, R->x);
-   fp_copy(&y, R->y);
-   fp_copy(&z, R->z);
-
-   /* Clear used locals */
-   fp_clear(&x);
-   fp_clear(&y);
-   fp_clear(&z);
-   fp_clear(&t1);
-   fp_clear(&t2);
-
-   return MP_OKAY;
-}
-
-#else /* USE_FAST_MATH */
-
 /**
    Add two ECC points
    P        The point to add
@@ -585,43 +280,63 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
 int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
                              mp_int* modulus, mp_digit* mp)
 {
-   mp_int t1;
-   mp_int t2;
-   mp_int x;
-   mp_int y;
-   mp_int z;
+   mp_int t1, t2;
+#ifdef ALT_ECC_SIZE
+   mp_int rx, ry, rz;
+#endif
+   mp_int *x, *y, *z;
    int    err;
 
    if (P == NULL || Q == NULL || R == NULL || modulus == NULL || mp == NULL)
        return ECC_BAD_ARG_E;
 
-   if ((err = mp_init_multi(&t1, &t2, &x, &y, &z, NULL)) != MP_OKAY) {
+   if ((err = mp_init_multi(&t1, &t2, NULL, NULL, NULL, NULL)) != MP_OKAY) {
       return err;
    }
 
    /* should we dbl instead? */
-   err = mp_sub(modulus, Q->y, &t1);
-
+   if (err == MP_OKAY)
+       err = mp_sub(modulus, Q->y, &t1);
    if (err == MP_OKAY) {
        if ( (mp_cmp(P->x, Q->x) == MP_EQ) &&
             (get_digit_count(Q->z) && mp_cmp(P->z, Q->z) == MP_EQ) &&
             (mp_cmp(P->y, Q->y) == MP_EQ || mp_cmp(P->y, &t1) == MP_EQ)) {
                 mp_clear(&t1);
                 mp_clear(&t2);
-                mp_clear(&x);
-                mp_clear(&y);
-                mp_clear(&z);
-
                 return ecc_projective_dbl_point(P, R, modulus, mp);
        }
    }
+   
+   if (err != MP_OKAY) {
+      mp_clear(&t1);
+      mp_clear(&t2);
+      return err;
+   }
+
+#ifdef ALT_ECC_SIZE
+   /* Use local stack variable */
+   x = &rx;
+   y = &ry;
+   z = &rz;
+
+   if ((err = mp_init_multi(x, y, z, NULL, NULL, NULL)) != MP_OKAY) {
+      mp_clear(&t1);
+      mp_clear(&t2);
+      return err;
+   }
+#else
+   /* Use destination directly */
+   x = R->x;
+   y = R->y;
+   z = R->z;
+#endif
 
    if (err == MP_OKAY)
-       err = mp_copy(P->x, &x);
+       err = mp_copy(P->x, x);
    if (err == MP_OKAY)
-       err = mp_copy(P->y, &y);
+       err = mp_copy(P->y, y);
    if (err == MP_OKAY)
-       err = mp_copy(P->z, &z);
+       err = mp_copy(P->z, z);
 
    /* if Z is one then these are no-operations */
    if (err == MP_OKAY) {
@@ -633,9 +348,9 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
            /* X = X * T1 */
            if (err == MP_OKAY)
-               err = mp_mul(&t1, &x, &x);
+               err = mp_mul(&t1, x, x);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&x, modulus, *mp);
+               err = mp_montgomery_reduce(x, modulus, *mp);
 
            /* T1 = Z' * T1 */
            if (err == MP_OKAY)
@@ -645,15 +360,15 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
            /* Y = Y * T1 */
            if (err == MP_OKAY)
-               err = mp_mul(&t1, &y, &y);
+               err = mp_mul(&t1, y, y);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&y, modulus, *mp);
+               err = mp_montgomery_reduce(y, modulus, *mp);
        }
    }
 
    /* T1 = Z*Z */
    if (err == MP_OKAY)
-       err = mp_sqr(&z, &t1);
+       err = mp_sqr(z, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
@@ -665,7 +380,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
    /* T1 = Z * T1 */
    if (err == MP_OKAY)
-       err = mp_mul(&z, &t1, &t1);
+       err = mp_mul(z, &t1, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
@@ -677,10 +392,10 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 
    /* Y = Y - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(&y, &t1, &y);
+       err = mp_sub(y, &t1, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&y, 0) == MP_LT)
-           err = mp_add(&y, modulus, &y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* T1 = 2T1 */
    if (err == MP_OKAY)
@@ -691,17 +406,17 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    }
    /* T1 = Y + T1 */
    if (err == MP_OKAY)
-       err = mp_add(&t1, &y, &t1);
+       err = mp_add(&t1, y, &t1);
    if (err == MP_OKAY) {
        if (mp_cmp(&t1, modulus) != MP_LT)
            err = mp_sub(&t1, modulus, &t1);
    }
    /* X = X - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(&x, &t2, &x);
+       err = mp_sub(x, &t2, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&x, 0) == MP_LT)
-           err = mp_add(&x, modulus, &x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* T2 = 2T2 */
    if (err == MP_OKAY)
@@ -712,7 +427,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    }
    /* T2 = X + T2 */
    if (err == MP_OKAY)
-       err = mp_add(&t2, &x, &t2);
+       err = mp_add(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp(&t2, modulus) != MP_LT)
            err = mp_sub(&t2, modulus, &t2);
@@ -721,103 +436,104 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    if (err == MP_OKAY) {
        if (get_digit_count(Q->z)) {
            /* Z = Z * Z' */
-           err = mp_mul(&z, Q->z, &z);
+           err = mp_mul(z, Q->z, z);
            if (err == MP_OKAY)
-               err = mp_montgomery_reduce(&z, modulus, *mp);
+               err = mp_montgomery_reduce(z, modulus, *mp);
        }
    }
 
    /* Z = Z * X */
    if (err == MP_OKAY)
-       err = mp_mul(&z, &x, &z);
+       err = mp_mul(z, x, z);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&z, modulus, *mp);
+       err = mp_montgomery_reduce(z, modulus, *mp);
 
    /* T1 = T1 * X  */
    if (err == MP_OKAY)
-       err = mp_mul(&t1, &x, &t1);
+       err = mp_mul(&t1, x, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* X = X * X */
    if (err == MP_OKAY)
-       err = mp_sqr(&x, &x);
+       err = mp_sqr(x, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* T2 = T2 * x */
    if (err == MP_OKAY)
-       err = mp_mul(&t2, &x, &t2);
+       err = mp_mul(&t2, x, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
    /* T1 = T1 * X  */
    if (err == MP_OKAY)
-       err = mp_mul(&t1, &x, &t1);
+       err = mp_mul(&t1, x, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* X = Y*Y */
    if (err == MP_OKAY)
-       err = mp_sqr(&y, &x);
+       err = mp_sqr(y, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(&x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* X = X - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(&x, &t2, &x);
+       err = mp_sub(x, &t2, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&x, 0) == MP_LT)
-           err = mp_add(&x, modulus, &x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* T2 = T2 - X */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &x, &t2);
+       err = mp_sub(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T2 = T2 - X */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &x, &t2);
+       err = mp_sub(&t2, x, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T2 = T2 * Y */
    if (err == MP_OKAY)
-       err = mp_mul(&t2, &y, &t2);
+       err = mp_mul(&t2, y, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
    /* Y = T2 - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(&t2, &t1, &y);
+       err = mp_sub(&t2, &t1, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(&y, 0) == MP_LT)
-           err = mp_add(&y, modulus, &y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* Y = Y/2 */
    if (err == MP_OKAY) {
-       if (mp_isodd(&y))
-           err = mp_add(&y, modulus, &y);
+       if (mp_isodd(y))
+           err = mp_add(y, modulus, y);
    }
    if (err == MP_OKAY)
-       err = mp_div_2(&y, &y);
+       err = mp_div_2(y, y);
 
+#ifdef ALT_ECC_SIZE
    if (err == MP_OKAY)
-       err = mp_copy(&x, R->x);
+       err = mp_copy(x, R->x);
    if (err == MP_OKAY)
-       err = mp_copy(&y, R->y);
+       err = mp_copy(y, R->y);
    if (err == MP_OKAY)
-       err = mp_copy(&z, R->z);
+       err = mp_copy(z, R->z);
+#endif
 
+#ifndef USE_FAST_MATH
    /* clean up */
    mp_clear(&t1);
    mp_clear(&t2);
-   mp_clear(&x);
-   mp_clear(&y);
-   mp_clear(&z);
+#endif
 
    return err;
 }
@@ -834,8 +550,11 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
 int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
                              mp_digit* mp)
 {
-   mp_int t1;
-   mp_int t2;
+   mp_int t1, t2;
+#ifdef ALT_ECC_SIZE
+   mp_int rx, ry, rz;
+#endif
+   mp_int *x, *y, *z;
    int    err;
 
    if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
@@ -845,44 +564,61 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
       return err;
    }
 
-   if (P != R) {
-      err = mp_copy(P->x, R->x);
-      if (err == MP_OKAY)
-          err = mp_copy(P->y, R->y);
-      if (err == MP_OKAY)
-          err = mp_copy(P->z, R->z);
+#ifdef ALT_ECC_SIZE
+   /* Use local stack variable */
+   x = &rx;
+   y = &ry;
+   z = &rz;
+
+   if ((err = mp_init_multi(x, y, z, NULL, NULL, NULL)) != MP_OKAY) {
+       mp_clear(&t1);
+       mp_clear(&t2);
+       return err;
    }
+#else
+   /* Use destination directly */
+   x = R->x;
+   y = R->y;
+   z = R->z;
+#endif
+
+   if (err == MP_OKAY)
+       err = mp_copy(P->x, x);
+   if (err == MP_OKAY)
+       err = mp_copy(P->y, y);
+   if (err == MP_OKAY)
+       err = mp_copy(P->z, z);
 
    /* t1 = Z * Z */
    if (err == MP_OKAY)
-       err = mp_sqr(R->z, &t1);
+       err = mp_sqr(z, &t1);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t1, modulus, *mp);
 
    /* Z = Y * Z */
    if (err == MP_OKAY)
-       err = mp_mul(R->z, R->y, R->z);
+       err = mp_mul(z, y, z);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->z, modulus, *mp);
+       err = mp_montgomery_reduce(z, modulus, *mp);
 
    /* Z = 2Z */
    if (err == MP_OKAY)
-       err = mp_add(R->z, R->z, R->z);
+       err = mp_add(z, z, z);
    if (err == MP_OKAY) {
-       if (mp_cmp(R->z, modulus) != MP_LT)
-           err = mp_sub(R->z, modulus, R->z);
+       if (mp_cmp(z, modulus) != MP_LT)
+           err = mp_sub(z, modulus, z);
    }
 
    /* T2 = X - T1 */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, &t1, &t2);
+       err = mp_sub(x, &t1, &t2);
    if (err == MP_OKAY) {
        if (mp_cmp_d(&t2, 0) == MP_LT)
            err = mp_add(&t2, modulus, &t2);
    }
    /* T1 = X + T1 */
    if (err == MP_OKAY)
-       err = mp_add(&t1, R->x, &t1);
+       err = mp_add(&t1, x, &t1);
    if (err == MP_OKAY) {
        if (mp_cmp(&t1, modulus) != MP_LT)
            err = mp_sub(&t1, modulus, &t1);
@@ -909,20 +645,20 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
    }
    /* Y = 2Y */
    if (err == MP_OKAY)
-       err = mp_add(R->y, R->y, R->y);
+       err = mp_add(y, y, y);
    if (err == MP_OKAY) {
-       if (mp_cmp(R->y, modulus) != MP_LT)
-           err = mp_sub(R->y, modulus, R->y);
+       if (mp_cmp(y, modulus) != MP_LT)
+           err = mp_sub(y, modulus, y);
    }
    /* Y = Y * Y */
    if (err == MP_OKAY)
-       err = mp_sqr(R->y, R->y);
+       err = mp_sqr(y, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* T2 = Y * Y */
    if (err == MP_OKAY)
-       err = mp_sqr(R->y, &t2);
+       err = mp_sqr(y, &t2);
    if (err == MP_OKAY)
        err = mp_montgomery_reduce(&t2, modulus, *mp);
 
@@ -936,59 +672,69 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
 
    /* Y = Y * X */
    if (err == MP_OKAY)
-       err = mp_mul(R->y, R->x, R->y);
+       err = mp_mul(y, x, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* X  = T1 * T1 */
    if (err == MP_OKAY)
-       err = mp_sqr(&t1, R->x);
+       err = mp_sqr(&t1, x);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->x, modulus, *mp);
+       err = mp_montgomery_reduce(x, modulus, *mp);
 
    /* X = X - Y */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, R->y, R->x);
+       err = mp_sub(x, y, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->x, 0) == MP_LT)
-           err = mp_add(R->x, modulus, R->x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* X = X - Y */
    if (err == MP_OKAY)
-       err = mp_sub(R->x, R->y, R->x);
+       err = mp_sub(x, y, x);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->x, 0) == MP_LT)
-           err = mp_add(R->x, modulus, R->x);
+       if (mp_cmp_d(x, 0) == MP_LT)
+           err = mp_add(x, modulus, x);
    }
    /* Y = Y - X */
    if (err == MP_OKAY)
-       err = mp_sub(R->y, R->x, R->y);
+       err = mp_sub(y, x, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->y, 0) == MP_LT)
-           err = mp_add(R->y, modulus, R->y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
    /* Y = Y * T1 */
    if (err == MP_OKAY)
-       err = mp_mul(R->y, &t1, R->y);
+       err = mp_mul(y, &t1, y);
    if (err == MP_OKAY)
-       err = mp_montgomery_reduce(R->y, modulus, *mp);
+       err = mp_montgomery_reduce(y, modulus, *mp);
 
    /* Y = Y - T2 */
    if (err == MP_OKAY)
-       err = mp_sub(R->y, &t2, R->y);
+       err = mp_sub(y, &t2, y);
    if (err == MP_OKAY) {
-       if (mp_cmp_d(R->y, 0) == MP_LT)
-           err = mp_add(R->y, modulus, R->y);
+       if (mp_cmp_d(y, 0) == MP_LT)
+           err = mp_add(y, modulus, y);
    }
 
+#ifdef ALT_ECC_SIZE
+   if (err == MP_OKAY)
+       err = mp_copy(x, R->x);
+   if (err == MP_OKAY)
+       err = mp_copy(y, R->y);
+   if (err == MP_OKAY)
+       err = mp_copy(z, R->z);
+#endif
+
+#ifndef USE_FAST_MATH
    /* clean up */
    mp_clear(&t1);
    mp_clear(&t2);
+#endif
 
    return err;
 }
 
-#endif /* USE_FAST_MATH */
 
 /**
   Map a projective jacbobian point back to affine space

From 44b1f98b39e242c4a845f8b4150cf32dcdeb31f1 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Fri, 6 May 2016 10:59:32 -0700
Subject: [PATCH 12/16] Fixed issue with ALT_ECC_SIZE and default value for
 FP_MAX_BITS_ECC so it's based on max enabled ECC curve bits.

---
 wolfssl/wolfcrypt/ecc.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index b952e68b9..9f97b902c 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -61,6 +61,27 @@ typedef struct {
 } ecc_set_type;
 
 
+/* Determine max ECC bits based on enabled curves */
+#if defined(HAVE_ECC521) || defined(HAVE_ALL_CURVES)
+    #define MAX_ECC_BITS    528
+#elif defined(HAVE_ECC384)
+    #define MAX_ECC_BITS    384
+#elif defined(HAVE_ECC224)
+    #define MAX_ECC_BITS    224
+#elif !defined(NO_ECC256)
+    #define MAX_ECC_BITS    256
+#elif defined(HAVE_ECC192)
+    #define MAX_ECC_BITS    192
+#elif defined(HAVE_ECC160)
+    #define MAX_ECC_BITS    160
+#elif defined(HAVE_ECC128)
+    #define MAX_ECC_BITS    128
+#elif defined(HAVE_ECC112)
+    #define MAX_ECC_BITS    112
+#endif
+
+
+
 #ifdef ALT_ECC_SIZE
 
 /* Note on ALT_ECC_SIZE:
@@ -91,12 +112,16 @@ typedef struct {
 #endif
 
 #ifndef FP_MAX_BITS_ECC
-    #define FP_MAX_BITS_ECC           528
+    /* This value should be double the max ecc bit size */
+    #define FP_MAX_BITS_ECC       (MAX_ECC_BITS*2)
 #endif
+
 #define FP_MAX_SIZE_ECC           (FP_MAX_BITS_ECC+(8*DIGIT_BIT))
+
 #if FP_MAX_BITS_ECC % CHAR_BIT
    #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
 #endif
+
 #define FP_SIZE_ECC    (FP_MAX_SIZE_ECC/DIGIT_BIT)
 
 /* This needs to match the size of the fp_int struct, except the

From 8c9b8a596ad61a8c1f0d3780350240979e49b784 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Mon, 9 May 2016 09:50:51 -0700
Subject: [PATCH 13/16] Fixed calculation of max ECC bits with ALT_ECC_SIZE
 defined so it only allocates what is required. For 8-bit aligned curve sizes
 its double the max bits. For un-aligned curves sized, like ECC521, its 521
 8-bit aligned, doubled, plus digit bit.

---
 wolfssl/wolfcrypt/ecc.h | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index 9f97b902c..f9ca3b41a 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -63,7 +63,7 @@ typedef struct {
 
 /* Determine max ECC bits based on enabled curves */
 #if defined(HAVE_ECC521) || defined(HAVE_ALL_CURVES)
-    #define MAX_ECC_BITS    528
+    #define MAX_ECC_BITS    521
 #elif defined(HAVE_ECC384)
     #define MAX_ECC_BITS    384
 #elif defined(HAVE_ECC224)
@@ -111,18 +111,26 @@ typedef struct {
     #error USE_FAST_MATH must be defined to use ALT_ECC_SIZE
 #endif
 
+/* determine max bits required for ECC math */
 #ifndef FP_MAX_BITS_ECC
-    /* This value should be double the max ecc bit size */
-    #define FP_MAX_BITS_ECC       (MAX_ECC_BITS*2)
+    /* check alignment */
+    #if ((MAX_ECC_BITS & CHAR_BIT) == 0)
+        /* max bits is double */
+        #define FP_MAX_BITS_ECC     (MAX_ECC_BITS * 2)
+    #else
+        /* max bits is rounded up to 8-bit alignment, doubled, plus one digit of fudge */
+        #define FP_MAX_BITS_ECC     ((((MAX_ECC_BITS + CHAR_BIT) & ~CHAR_BIT) * 2) + DIGIT_BIT)
+    #endif
 #endif
 
-#define FP_MAX_SIZE_ECC           (FP_MAX_BITS_ECC+(8*DIGIT_BIT))
-
+/* verify alignment */
 #if FP_MAX_BITS_ECC % CHAR_BIT
    #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
 #endif
 
-#define FP_SIZE_ECC    (FP_MAX_SIZE_ECC/DIGIT_BIT)
+/* determine buffer size */
+#define FP_SIZE_ECC    (FP_MAX_BITS_ECC/DIGIT_BIT)
+
 
 /* This needs to match the size of the fp_int struct, except the
  * fp_digit array will be shorter. */

From 8f6352725a8312c2a88d80132c7a8b250f6680cb Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Mon, 9 May 2016 10:34:37 -0700
Subject: [PATCH 14/16] Fixed math for FP_MAX_BITS_ECC calculations. Error in
 alignment check. Altered non-aligned formula to be (max bits * 2) + digit,
 then 8-bit aligned. Cleanup of the example user_settings.h.

---
 IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h | 7 ++++---
 wolfssl/wolfcrypt/ecc.h                   | 6 +++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h b/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
index 7947ef34e..f8d751ff0 100644
--- a/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
+++ b/IDE/ROWLEY-CROSSWORKS-ARM/user_settings.h
@@ -79,12 +79,13 @@ extern "C" {
     #define ECC_TIMING_RESISTANT
 
     #ifdef USE_FAST_MATH
-        /* Max ECC bits (curve size * 8). ECC256 is (32*8) = 256 */
-        /* Note: ECC521 requires (curve size * 16): (66*16) = 1056 */
+        /* use reduced size math buffers for ecc points */
         #undef  ALT_ECC_SIZE
         #define ALT_ECC_SIZE
+
+        /* optionally override the default max ecc bits */
         #undef  FP_MAX_BITS_ECC
-        #define FP_MAX_BITS_ECC     1056
+        //#define FP_MAX_BITS_ECC     512
 
         /* Enable TFM optimizations for ECC */
         #define TFM_ECC192
diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index f9ca3b41a..a1bc1f61a 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -114,12 +114,12 @@ typedef struct {
 /* determine max bits required for ECC math */
 #ifndef FP_MAX_BITS_ECC
     /* check alignment */
-    #if ((MAX_ECC_BITS & CHAR_BIT) == 0)
+    #if (MAX_ECC_BITS % CHAR_BIT) == 0
         /* max bits is double */
         #define FP_MAX_BITS_ECC     (MAX_ECC_BITS * 2)
     #else
-        /* max bits is rounded up to 8-bit alignment, doubled, plus one digit of fudge */
-        #define FP_MAX_BITS_ECC     ((((MAX_ECC_BITS + CHAR_BIT) & ~CHAR_BIT) * 2) + DIGIT_BIT)
+        /* max bits is doubled, plus one digit of fudge then 8-bit aligned */
+        #define FP_MAX_BITS_ECC     (((MAX_ECC_BITS * 2) + DIGIT_BIT) & ~(CHAR_BIT-1))
     #endif
 #endif
 

From d71d0f2cb42df5bd8798f0c18ca84ce7dfb371c4 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Mon, 9 May 2016 13:29:25 -0700
Subject: [PATCH 15/16] Fix with fast math disabled so ecc_projective_add_point
 uses temp local variable for x,y,z result.

---
 wolfcrypt/src/ecc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index cf5989344..9167eac36 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -281,7 +281,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
                              mp_int* modulus, mp_digit* mp)
 {
    mp_int t1, t2;
-#ifdef ALT_ECC_SIZE
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
    mp_int rx, ry, rz;
 #endif
    mp_int *x, *y, *z;
@@ -313,7 +313,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
       return err;
    }
 
-#ifdef ALT_ECC_SIZE
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
    /* Use local stack variable */
    x = &rx;
    y = &ry;
@@ -520,7 +520,7 @@ int ecc_projective_add_point(ecc_point* P, ecc_point* Q, ecc_point* R,
    if (err == MP_OKAY)
        err = mp_div_2(y, y);
 
-#ifdef ALT_ECC_SIZE
+#if (defined(USE_FAST_MATH) && defined(ALT_ECC_SIZE)) || !defined(USE_FAST_MATH)
    if (err == MP_OKAY)
        err = mp_copy(x, R->x);
    if (err == MP_OKAY)

From 2fb45069225330d9ecce95a2795d4e6d50bd0f57 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Tue, 10 May 2016 12:20:39 -0700
Subject: [PATCH 16/16] Fixes to FP_MAX_BITS_ECC calculation. Alignment check
 against digit_bits is based on max ecc bits times two. If alignment check
 fails we add a digit_bit to make sure we have enough room.

---
 wolfssl/wolfcrypt/ecc.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/wolfssl/wolfcrypt/ecc.h b/wolfssl/wolfcrypt/ecc.h
index a1bc1f61a..e67a49720 100644
--- a/wolfssl/wolfcrypt/ecc.h
+++ b/wolfssl/wolfcrypt/ecc.h
@@ -114,18 +114,18 @@ typedef struct {
 /* determine max bits required for ECC math */
 #ifndef FP_MAX_BITS_ECC
     /* check alignment */
-    #if (MAX_ECC_BITS % CHAR_BIT) == 0
+    #if ((MAX_ECC_BITS * 2) % DIGIT_BIT) == 0
         /* max bits is double */
         #define FP_MAX_BITS_ECC     (MAX_ECC_BITS * 2)
     #else
-        /* max bits is doubled, plus one digit of fudge then 8-bit aligned */
-        #define FP_MAX_BITS_ECC     (((MAX_ECC_BITS * 2) + DIGIT_BIT) & ~(CHAR_BIT-1))
+        /* max bits is doubled, plus one digit of fudge */
+        #define FP_MAX_BITS_ECC     ((MAX_ECC_BITS * 2) + DIGIT_BIT)
+    #endif
+#else
+    /* verify alignment */
+    #if FP_MAX_BITS_ECC % CHAR_BIT
+       #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
     #endif
-#endif
-
-/* verify alignment */
-#if FP_MAX_BITS_ECC % CHAR_BIT
-   #error FP_MAX_BITS_ECC must be a multiple of CHAR_BIT
 #endif
 
 /* determine buffer size */