diff --git a/configure.ac b/configure.ac
index 444c1c5c2..ba94a299d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -557,7 +557,7 @@ AC_ARG_ENABLE([ecc],
 
 if test "$ENABLED_ECC" = "yes"
 then
-    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256"
+    AM_CFLAGS="$AM_CFLAGS -DHAVE_ECC -DTFM_ECC256 -DECC_SHAMIR"
 fi
 
 AM_CONDITIONAL([BUILD_ECC], [test "x$ENABLED_ECC" = "xyes"])
diff --git a/ctaocrypt/src/ecc.c b/ctaocrypt/src/ecc.c
index 2698b0555..bc4c9c3a7 100644
--- a/ctaocrypt/src/ecc.c
+++ b/ctaocrypt/src/ecc.c
@@ -178,6 +178,290 @@ static unsigned long get_digit(mp_int* a, int n)
 }
 
 
+#ifdef USE_FAST_MATH
+
+/* fast math accelerated version */
+
+/**
+   Add two ECC points, R = P + Q (fast math, Montgomery reduction)
+   P        The first point to add
+   Q        The second point to add
+   R        [out] The destination of the sum
+   modulus  The modulus of the field the ECC curve is in
+   mp       The "b" value from montgomery_setup()
+   return   MP_OKAY on success, ECC_BAD_ARG_E on a NULL argument
+*/
+int ecc_projective_add_point(ecc_point *P, ecc_point *Q, ecc_point *R,
+                             mp_int* modulus, mp_digit* mp)
+{
+   mp_int t1, t2, x, y, z;
+   int    err;
+
+   if (P == NULL || Q == NULL || R == NULL || modulus == NULL || mp == NULL)
+       return ECC_BAD_ARG_E;
+
+   if ((err = mp_init_multi(&t1, &t2, &x, &y, &z, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* should we dbl instead? t1 = modulus - Q.y, so this matches P.y == +/-Q.y */
+   fp_sub(modulus, &Q->y, &t1);
+   if ( (fp_cmp(&P->x, &Q->x) == FP_EQ) && 
+        (&Q->z != NULL && fp_cmp(&P->z, &Q->z) == FP_EQ) &&
+        (fp_cmp(&P->y, &Q->y) == FP_EQ || fp_cmp(&P->y, &t1) == FP_EQ)) {
+        return ecc_projective_dbl_point(P, R, modulus, mp);
+   }
+
+   fp_copy(&P->x, &x);
+   fp_copy(&P->y, &y);
+   fp_copy(&P->z, &z);
+
+   /* scale X,Y by Z'; NOTE(review): Q is non-NULL so &Q->z != NULL always holds */
+   if (&Q->z != NULL) {
+      /* T1 = Z' * Z' */
+      fp_sqr(&Q->z, &t1);
+      fp_montgomery_reduce(&t1, modulus, *mp);
+      /* X = X * T1 */
+      fp_mul(&t1, &x, &x);
+      fp_montgomery_reduce(&x, modulus, *mp);
+      /* T1 = Z' * T1 */
+      fp_mul(&Q->z, &t1, &t1);
+      fp_montgomery_reduce(&t1, modulus, *mp);
+      /* Y = Y * T1 */
+      fp_mul(&t1, &y, &y);
+      fp_montgomery_reduce(&y, modulus, *mp);
+   }
+
+   /* T1 = Z*Z */
+   fp_sqr(&z, &t1);
+   fp_montgomery_reduce(&t1, modulus, *mp);
+   /* T2 = X' * T1 */
+   fp_mul(&Q->x, &t1, &t2);
+   fp_montgomery_reduce(&t2, modulus, *mp);
+   /* T1 = Z * T1 */
+   fp_mul(&z, &t1, &t1);
+   fp_montgomery_reduce(&t1, modulus, *mp);
+   /* T1 = Y' * T1 */
+   fp_mul(&Q->y, &t1, &t1);
+   fp_montgomery_reduce(&t1, modulus, *mp);
+
+   /* Y = Y - T1 (add the modulus back if the difference went negative) */
+   fp_sub(&y, &t1, &y);
+   if (fp_cmp_d(&y, 0) == FP_LT) {
+      fp_add(&y, modulus, &y);
+   }
+   /* T1 = 2T1 (subtract the modulus if the sum reached or passed it) */
+   fp_add(&t1, &t1, &t1);
+   if (fp_cmp(&t1, modulus) != FP_LT) {
+      fp_sub(&t1, modulus, &t1);
+   }
+   /* T1 = Y + T1 */
+   fp_add(&t1, &y, &t1);
+   if (fp_cmp(&t1, modulus) != FP_LT) {
+      fp_sub(&t1, modulus, &t1);
+   }
+   /* X = X - T2 */
+   fp_sub(&x, &t2, &x);
+   if (fp_cmp_d(&x, 0) == FP_LT) {
+      fp_add(&x, modulus, &x);
+   }
+   /* T2 = 2T2 */
+   fp_add(&t2, &t2, &t2);
+   if (fp_cmp(&t2, modulus) != FP_LT) {
+      fp_sub(&t2, modulus, &t2);
+   }
+   /* T2 = X + T2 */
+   fp_add(&t2, &x, &t2);
+   if (fp_cmp(&t2, modulus) != FP_LT) {
+      fp_sub(&t2, modulus, &t2);
+   }
+
+   /* if Z' != 1 -- NOTE(review): as written this test is always true */
+   if (&Q->z != NULL) {
+      /* Z = Z * Z' */
+      fp_mul(&z, &Q->z, &z);
+      fp_montgomery_reduce(&z, modulus, *mp);
+   }
+
+   /* Z = Z * X */
+   fp_mul(&z, &x, &z);
+   fp_montgomery_reduce(&z, modulus, *mp);
+
+   /* T1 = T1 * X  */
+   fp_mul(&t1, &x, &t1);
+   fp_montgomery_reduce(&t1, modulus, *mp);
+   /* X = X * X */
+   fp_sqr(&x, &x);
+   fp_montgomery_reduce(&x, modulus, *mp);
+   /* T2 = T2 * X (X now holds the squared value) */
+   fp_mul(&t2, &x, &t2);
+   fp_montgomery_reduce(&t2, modulus, *mp);
+   /* T1 = T1 * X  */
+   fp_mul(&t1, &x, &t1);
+   fp_montgomery_reduce(&t1, modulus, *mp);
+ 
+   /* X = Y*Y */
+   fp_sqr(&y, &x);
+   fp_montgomery_reduce(&x, modulus, *mp);
+   /* X = X - T2 */
+   fp_sub(&x, &t2, &x);
+   if (fp_cmp_d(&x, 0) == FP_LT) {
+      fp_add(&x, modulus, &x);
+   }
+
+   /* T2 = T2 - X */
+   fp_sub(&t2, &x, &t2);
+   if (fp_cmp_d(&t2, 0) == FP_LT) {
+      fp_add(&t2, modulus, &t2);
+   } 
+   /* T2 = T2 - X (second subtraction, so T2 has lost 2X in total) */
+   fp_sub(&t2, &x, &t2);
+   if (fp_cmp_d(&t2, 0) == FP_LT) {
+      fp_add(&t2, modulus, &t2);
+   }
+   /* T2 = T2 * Y */
+   fp_mul(&t2, &y, &t2);
+   fp_montgomery_reduce(&t2, modulus, *mp);
+   /* Y = T2 - T1 */
+   fp_sub(&t2, &t1, &y);
+   if (fp_cmp_d(&y, 0) == FP_LT) {
+      fp_add(&y, modulus, &y);
+   }
+   /* Y = Y/2 (an odd Y gets the modulus added first so the shift is exact) */
+   if (fp_isodd(&y)) {
+      fp_add(&y, modulus, &y);
+   }
+   fp_div_2(&y, &y);
+
+   fp_copy(&x, &R->x);
+   fp_copy(&y, &R->y);
+   fp_copy(&z, &R->z);
+   
+   return MP_OKAY;
+}
+
+
+/**
+   Double an ECC point, R = 2P (fast math version)
+   P   The point to double
+   R   [out] The destination of the double (may be the same object as P)
+   modulus  The modulus of the field the ECC curve is in
+   mp       The "b" value from montgomery_setup()
+   return   MP_OKAY on success, ECC_BAD_ARG_E on a NULL argument
+*/
+int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
+                             mp_digit* mp)
+{
+   mp_int   t1, t2;
+   int      err;
+
+   if (P == NULL || R == NULL || modulus == NULL || mp == NULL)
+       return ECC_BAD_ARG_E;
+
+   if (P != R) {
+      err = mp_copy(&P->x, &R->x);
+      if (err == MP_OKAY)
+        err = mp_copy(&P->y, &R->y);
+      if (err == MP_OKAY)
+        err = mp_copy(&P->z, &R->z);
+
+      if (err != MP_OKAY)
+          return err;
+   }
+
+   if ((err = mp_init_multi(&t1, &t2, NULL, NULL, NULL, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* t1 = Z * Z (all work from here on is done in place on R) */
+   mp_sqr(&R->z, &t1);
+   mp_montgomery_reduce(&t1, modulus, *mp);
+   /* Z = Y * Z */
+   mp_mul(&R->z, &R->y, &R->z);
+   mp_montgomery_reduce(&R->z, modulus, *mp);
+   /* Z = 2Z; NOTE(review): mp_cmp vs FP_LT assumes mp_* alias fp_* here */
+   mp_add(&R->z, &R->z, &R->z);
+   if (mp_cmp(&R->z, modulus) != FP_LT) {
+      mp_sub(&R->z, modulus, &R->z);
+   }
+   
+   /* T2 = X - T1 (add the modulus back if the difference went negative) */
+   mp_sub(&R->x, &t1, &t2);
+   if (mp_cmp_d(&t2, 0) == FP_LT) {
+      mp_add(&t2, modulus, &t2);
+   }
+   /* T1 = X + T1 */
+   mp_add(&t1, &R->x, &t1);
+   if (mp_cmp(&t1, modulus) != FP_LT) {
+      mp_sub(&t1, modulus, &t1);
+   }
+   /* T2 = T1 * T2 */
+   mp_mul(&t1, &t2, &t2);
+   mp_montgomery_reduce(&t2, modulus, *mp);
+   /* T1 = 2T2 */
+   mp_add(&t2, &t2, &t1);
+   if (mp_cmp(&t1, modulus) != FP_LT) {
+      mp_sub(&t1, modulus, &t1);
+   }
+   /* T1 = T1 + T2 (so T1 is now three times T2) */
+   mp_add(&t1, &t2, &t1);
+   if (mp_cmp(&t1, modulus) != FP_LT) {
+      mp_sub(&t1, modulus, &t1);
+   }
+
+   /* Y = 2Y */
+   mp_add(&R->y, &R->y, &R->y);
+   if (mp_cmp(&R->y, modulus) != FP_LT) {
+      mp_sub(&R->y, modulus, &R->y);
+   }
+   /* Y = Y * Y */
+   mp_sqr(&R->y, &R->y);
+   mp_montgomery_reduce(&R->y, modulus, *mp);
+   /* T2 = Y * Y */
+   mp_sqr(&R->y, &t2);
+   mp_montgomery_reduce(&t2, modulus, *mp);
+   /* T2 = T2/2 (an odd T2 gets the modulus added first so the shift is exact) */
+   if (mp_isodd(&t2)) {
+      mp_add(&t2, modulus, &t2);
+   }
+   mp_div_2(&t2, &t2);
+   /* Y = Y * X */
+   mp_mul(&R->y, &R->x, &R->y);
+   mp_montgomery_reduce(&R->y, modulus, *mp);
+
+   /* X  = T1 * T1 */
+   mp_sqr(&t1, &R->x);
+   mp_montgomery_reduce(&R->x, modulus, *mp);
+   /* X = X - Y */
+   mp_sub(&R->x, &R->y, &R->x);
+   if (mp_cmp_d(&R->x, 0) == FP_LT) {
+      mp_add(&R->x, modulus, &R->x);
+   }
+   /* X = X - Y (second subtraction, so X has lost 2Y in total) */
+   mp_sub(&R->x, &R->y, &R->x);
+   if (mp_cmp_d(&R->x, 0) == FP_LT) {
+      mp_add(&R->x, modulus, &R->x);
+   }
+
+   /* Y = Y - X */     
+   mp_sub(&R->y, &R->x, &R->y);
+   if (mp_cmp_d(&R->y, 0) == FP_LT) {
+      mp_add(&R->y, modulus, &R->y);
+   }
+   /* Y = Y * T1 */
+   mp_mul(&R->y, &t1, &R->y);
+   mp_montgomery_reduce(&R->y, modulus, *mp);
+   /* Y = Y - T2 */
+   mp_sub(&R->y, &t2, &R->y);
+   if (mp_cmp_d(&R->y, 0) == FP_LT) {
+      mp_add(&R->y, modulus, &R->y);
+   }
+ 
+   return err;
+}
+
+#else /* USE_FAST_MATH */
+
 /**
    Add two ECC points
    P        The point to add
@@ -593,6 +877,7 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* modulus,
    return err;
 }
 
+#endif /* USE_FAST_MATH */
 
 /**
   Map a projective jacbobian point back to affine space
@@ -1225,6 +1510,234 @@ void ecc_free(ecc_key* key)
 }
 
 
+#ifdef ECC_SHAMIR
+
+#ifdef USE_FAST_MATH
+    #define GEN_MEM_ERR FP_MEM
+#else
+    #define GEN_MEM_ERR MP_MEM
+#endif
+
+/** Computes kA*A + kB*B = C using Shamir's Trick
+  A        First point to multiply
+  kA       What to multiply A by
+  B        Second point to multiply
+  kB       What to multiply B by
+  C        [out] Destination point (can overlap with A or B)
+  modulus  Modulus for curve
+  return   MP_OKAY on success; ECC_BAD_ARG_E on a NULL argument, BAD_FUNC_ARG
+           if kA or kB exceeds ECC_BUFSIZE bytes, GEN_MEM_ERR on allocation
+           failure, otherwise the error of the failing math call
+*/
+static int ecc_mul2add(ecc_point* A, mp_int* kA,
+                    ecc_point* B, mp_int* kB,
+                    ecc_point* C, mp_int* modulus)
+{
+  ecc_point*     precomp[16];
+  unsigned       bitbufA, bitbufB, lenA, lenB, len, x, y, nA, nB, nibble;
+  unsigned char* tA;
+  unsigned char* tB;
+  int            err = MP_OKAY, first;
+  int            muInit    = 0;
+  int            tableInit = 0;
+  mp_digit mp;
+  mp_int   mu;
+
+  /* argchks */
+  if (A == NULL || kA == NULL || B == NULL || kB == NULL || C == NULL ||
+                   modulus == NULL)
+    return ECC_BAD_ARG_E;
+
+  /* allocate memory */
+  tA = XMALLOC(ECC_BUFSIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (tA == NULL) {
+     return GEN_MEM_ERR;
+  }
+  tB = XMALLOC(ECC_BUFSIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+  if (tB == NULL) {
+     XFREE(tA, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+     return GEN_MEM_ERR;
+  }
+
+  /* zero both buffers: the shorter scalar is right-justified below, so its
+     leading pad bytes are never written and must read as zero digits */
+  XMEMSET(tA, 0, ECC_BUFSIZE);
+  XMEMSET(tB, 0, ECC_BUFSIZE);
+
+  /* get sizes */
+  lenA = mp_unsigned_bin_size(kA);
+  lenB = mp_unsigned_bin_size(kB);
+  len  = MAX(lenA, lenB);
+
+  /* sanity check: both scalars must fit in the temp buffers */
+  if ((lenA > ECC_BUFSIZE) || (lenB > ECC_BUFSIZE)) {
+     err = BAD_FUNC_ARG;
+  }
+
+  if (err == MP_OKAY) {
+    /* extract and right-justify kA to len bytes */
+    mp_to_unsigned_bin(kA, (len - lenA) + tA);
+
+    /* extract and right-justify kB to len bytes */
+    mp_to_unsigned_bin(kB, (len - lenB) + tB);
+
+    /* allocate the table; on failure free the entries obtained so far */
+    for (x = 0; x < 16; x++) {
+        precomp[x] = ecc_new_point();
+        if (precomp[x] == NULL) {
+            for (y = 0; y < x; ++y) {
+                ecc_del_point(precomp[y]);
+            }
+            err = GEN_MEM_ERR;
+            break;
+        }
+    }
+  }
+
+  if (err == MP_OKAY)
+    tableInit = 1;
+
+  if (err == MP_OKAY)
+   /* init montgomery reduction */
+   err = mp_montgomery_setup(modulus, &mp);
+
+  if (err == MP_OKAY)
+    err = mp_init(&mu);
+  if (err == MP_OKAY)
+    muInit = 1;
+
+  if (err == MP_OKAY)
+    err = mp_montgomery_calc_normalization(&mu, modulus);
+
+  if (err == MP_OKAY)
+    /* copy ones: precomp[1] = A*mu, precomp[1<<2] = B*mu (mod modulus) */
+    err = mp_mulmod(&A->x, &mu, modulus, &precomp[1]->x);
+  if (err == MP_OKAY)
+    err = mp_mulmod(&A->y, &mu, modulus, &precomp[1]->y);
+  if (err == MP_OKAY)
+    err = mp_mulmod(&A->z, &mu, modulus, &precomp[1]->z);
+
+  if (err == MP_OKAY)
+    err = mp_mulmod(&B->x, &mu, modulus, &precomp[1<<2]->x);
+  if (err == MP_OKAY)
+    err = mp_mulmod(&B->y, &mu, modulus, &precomp[1<<2]->y);
+  if (err == MP_OKAY)
+    err = mp_mulmod(&B->z, &mu, modulus, &precomp[1<<2]->z);
+
+  if (err == MP_OKAY)
+    /* precomp [i,0](A + B) table */
+    err = ecc_projective_dbl_point(precomp[1], precomp[2], modulus, &mp);
+  if (err == MP_OKAY)
+    err = ecc_projective_add_point(precomp[1], precomp[2], precomp[3],
+                                   modulus, &mp);
+  if (err == MP_OKAY)
+    /* precomp [0,i](A + B) table */
+    err = ecc_projective_dbl_point(precomp[1<<2], precomp[2<<2], modulus, &mp);
+  if (err == MP_OKAY)
+    err = ecc_projective_add_point(precomp[1<<2], precomp[2<<2], precomp[3<<2],
+                                   modulus, &mp);
+
+  if (err == MP_OKAY) {
+    /* precomp [i,j](A + B) table (i != 0, j != 0) */
+    for (x = 1; x < 4; x++) {
+        for (y = 1; y < 4; y++) {
+            if (err == MP_OKAY)
+                err = ecc_projective_add_point(precomp[x], precomp[(y<<2)],
+                                               precomp[x+(y<<2)], modulus, &mp);
+        }
+    }
+  }
+
+  if (err == MP_OKAY) {
+    nibble  = 3;
+    first   = 1;
+    bitbufA = tA[0];
+    bitbufB = tB[0];
+
+    /* for every byte of the multiplicands (x wraps to 0 on the first ++x) */
+    for (x = -1;; ) {
+        /* grab a nibble */
+        if (++nibble == 4) {
+            ++x; if (x == len) break;
+            bitbufA = tA[x];
+            bitbufB = tB[x];
+            nibble  = 0;
+        }
+
+        /* extract two bits from both, shift/update */
+        nA = (bitbufA >> 6) & 0x03;
+        nB = (bitbufB >> 6) & 0x03;
+        bitbufA = (bitbufA << 2) & 0xFF;
+        bitbufB = (bitbufB << 2) & 0xFF;
+
+        /* if both zero and nothing accumulated yet, skip (leading zeros) */
+        if ((nA == 0) && (nB == 0) && (first == 1)) {
+            continue;
+        }
+
+        /* double twice, only if this isn't the first */
+        if (first == 0) {
+            if (err == MP_OKAY)
+                err = ecc_projective_dbl_point(C, C, modulus, &mp);
+            if (err == MP_OKAY)
+                err = ecc_projective_dbl_point(C, C, modulus, &mp);
+            else
+                break;
+        }
+
+        /* if not both zero */
+        if ((nA != 0) || (nB != 0)) {
+            if (first == 1) {
+                /* if first, copy from table */
+                first = 0;
+                if (err == MP_OKAY)
+                    err = mp_copy(&precomp[nA + (nB<<2)]->x, &C->x);
+                if (err == MP_OKAY)
+                    err = mp_copy(&precomp[nA + (nB<<2)]->y, &C->y);
+                if (err == MP_OKAY)
+                    err = mp_copy(&precomp[nA + (nB<<2)]->z, &C->z);
+                else
+                    break;
+            } else {
+                /* if not first, add from table */
+                if (err == MP_OKAY)
+                    err = ecc_projective_add_point(C, precomp[nA + (nB<<2)], C,
+                                                   modulus, &mp);
+                else
+                    break;
+            }
+        }
+    }
+  }
+
+  if (err == MP_OKAY)
+    /* reduce to affine */
+    err = ecc_map(C, modulus, &mp);
+
+  /* clean up */
+  if (muInit)
+    mp_clear(&mu);
+
+  if (tableInit) {
+    for (x = 0; x < 16; x++) {
+       ecc_del_point(precomp[x]);
+    }
+  }
+#ifdef LTC_CLEAN_STACK
+   XMEMSET(tA, 0, ECC_BUFSIZE);
+   XMEMSET(tB, 0, ECC_BUFSIZE);
+#endif
+   XFREE(tA, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+   XFREE(tB, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+
+   return err;
+}
+
+
+#endif /* ECC_SHAMIR */
+
+
+
 /* verify 
  *
  * w  = s^-1 mod n
@@ -1258,7 +1771,6 @@ int ecc_verify_hash(const byte* sig, word32 siglen, const byte* hash,
    mp_int        e;
    mp_int        p;
    mp_int        m;
-   mp_digit      mp;
    int           err;
 
    if (sig == NULL || hash == NULL || stat == NULL || key == NULL)
@@ -1360,6 +1872,9 @@ int ecc_verify_hash(const byte* sig, word32 siglen, const byte* hash,
        err = mp_copy(&key->pubkey.z, &mQ->z);
 
 #ifndef ECC_SHAMIR
+    {
+       mp_digit      mp;
+
        /* compute u1*mG + u2*mQ = mG */
        if (err == MP_OKAY)
            err = ecc_mulmod(&u1, mG, mG, &m, 0);
@@ -1377,6 +1892,7 @@ int ecc_verify_hash(const byte* sig, word32 siglen, const byte* hash,
        /* reduce */
        if (err == MP_OKAY)
            err = ecc_map(mG, &m, &mp);
+    }
 #else
        /* use Shamir's trick to compute u1*mG + u2*mQ using half the doubles */
        if (err == MP_OKAY)