From 6ef9e79ff5ccd2b96fdfed404ada872fd29514be Mon Sep 17 00:00:00 2001
From: toddouska <todd@wolfssl.com>
Date: Tue, 13 Sep 2016 09:13:39 -0700
Subject: [PATCH 1/2] switch timing resistant exptmod to use temp for square
 instead of leaking key bit to cache monitor

---
 wolfcrypt/src/tfm.c       | 31 +++++++++++++++++++++++++++++--
 wolfssl/wolfcrypt/types.h |  1 +
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index 35364334a..06a4846e4 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -1035,13 +1035,29 @@ int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 
 #ifdef TFM_TIMING_RESISTANT
 
+/* all off / all on pointer addresses for constant calculations */
+static const wolfssl_word off_on_addr[2] =
+{
+#if defined(WC_64BIT_CPU)
+    W64LIT(0x0000000000000000),
+    W64LIT(0xffffffffffffffff)
+#elif defined(WC_16BIT_CPU)
+    0x0000U,
+    0xffffU
+#else
+    /* 32 bit */
+    0x00000000U,
+    0xffffffffU
+#endif
+};
+
 /* timing resistant montgomery ladder based exptmod
    Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder",
    Cryptographic Hardware and Embedded Systems, CHES 2002
 */
 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 {
-  fp_int   R[2];
+  fp_int   R[3];
   fp_digit buf, mp;
   int      err, bitcnt, digidx, y;
 
@@ -1052,6 +1068,7 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 
   fp_init(&R[0]);
   fp_init(&R[1]);
+  fp_init(&R[2]);
 
   /* now we need R mod m */
   fp_montgomery_calc_normalization (&R[0], P);
@@ -1092,7 +1109,17 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 
     /* do ops */
     fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp);
-    fp_sqr(&R[y], &R[y]);          fp_montgomery_reduce(&R[y], P, mp);
+
+    /* instead of using R[y] for sqr, which leaks key bit to cache monitor,
+     * use R[2] as temp, make sure address calc is constant, keep
+     * &R[0] and &R[1] in cache */
+    fp_copy((fp_int*) ( ((wolfssl_word)&R[0] & off_on_addr[(y^1)]) +
+                        ((wolfssl_word)&R[1] & off_on_addr[y]) ),
+            &R[2]);
+    fp_sqr(&R[2], &R[2]);          fp_montgomery_reduce(&R[2], P, mp);
+    fp_copy(&R[2],
+            (fp_int*) ( ((wolfssl_word)&R[0] & off_on_addr[(y^1)]) +
+                        ((wolfssl_word)&R[1] & off_on_addr[y]) ) );
   }
 
    fp_montgomery_reduce(&R[0], P, mp);
diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h
index d67453601..7612546c0 100644
--- a/wolfssl/wolfcrypt/types.h
+++ b/wolfssl/wolfcrypt/types.h
@@ -91,6 +91,7 @@
 	     defined(__mips64)  || defined(__x86_64__) || defined(_M_X64)) || \
          defined(__aarch64__)
 	    typedef word64 wolfssl_word;
+        #define WC_64BIT_CPU
 	#else
 	    typedef word32 wolfssl_word;
 	    #ifdef WORD64_AVAILABLE

From 46a0ee8e690913a41a10f3c296cfab8345500218 Mon Sep 17 00:00:00 2001
From: toddouska <todd@wolfssl.com>
Date: Tue, 13 Sep 2016 11:10:10 -0700
Subject: [PATCH 2/2] switch ecc timing resistant mulmod double to use temp
 instead of leaking key bit to cache monitor

---
 wolfcrypt/src/ecc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-
 wolfcrypt/src/tfm.c | 11 +++++----
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index ec324308b..f4cc1e831 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -1876,6 +1876,27 @@ int wc_ecc_mulmod(mp_int* k, ecc_point *G, ecc_point *R, mp_int* a,
 
 #else /* ECC_TIMING_RESISTANT */
 
+
+#if defined(TFM_TIMING_RESISTANT) && defined(USE_FAST_MATH)
+    /* reuse the all off / all on mask table that tfm.c already defines */
+    extern const wolfssl_word wc_off_on_addr[2];
+#else /* tfm.c table not built: keep a private all off / all on copy */
+    static const wolfssl_word wc_off_on_addr[2] =
+    {
+    #if defined(WC_64BIT_CPU)
+        W64LIT(0x0000000000000000),
+        W64LIT(0xffffffffffffffff)
+    #elif defined(WC_16BIT_CPU)
+        0x0000U,
+        0xffffU
+    #else
+        /* 32 bit */
+        0x00000000U,
+        0xffffffffU
+    #endif
+    };
+#endif
+
 /**
    Perform a point multiplication  (timing resistant)
    k    The scalar to multiply by
@@ -2013,8 +2034,42 @@ int wc_ecc_mulmod_ex(mp_int* k, ecc_point *G, ecc_point *R,
            if (err == MP_OKAY)
                err = ecc_projective_add_point(M[0], M[1], M[i^1], a, modulus,
                                                                        mp);
+            /* doubling M[i] in place leaks the key bit to a cache monitor;
+             * double in temp M[2] instead, picking the M[0]/M[1] coordinate
+             * to copy with a branch-free mask-and-add on the two pointers */
            if (err == MP_OKAY)
-               err = ecc_projective_dbl_point(M[i], M[i], a, modulus, mp);
+               err = mp_copy((mp_int*)
+                             ( ((wolfssl_word)M[0]->x & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->x & wc_off_on_addr[i])),
+                             M[2]->x);
+           if (err == MP_OKAY)
+               err = mp_copy((mp_int*)
+                             ( ((wolfssl_word)M[0]->y & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->y & wc_off_on_addr[i])),
+                             M[2]->y);
+           if (err == MP_OKAY)
+               err = mp_copy((mp_int*)
+                             ( ((wolfssl_word)M[0]->z & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->z & wc_off_on_addr[i])),
+                             M[2]->z);
+           if (err == MP_OKAY)
+               err = ecc_projective_dbl_point(M[2], M[2], a, modulus, mp);
+           /* copy M[2] back to M[i] */
+           if (err == MP_OKAY)
+               err = mp_copy(M[2]->x,
+                             (mp_int*)
+                             ( ((wolfssl_word)M[0]->x & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->x & wc_off_on_addr[i])) );
+           if (err == MP_OKAY)
+               err = mp_copy(M[2]->y,
+                             (mp_int*)
+                             ( ((wolfssl_word)M[0]->y & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->y & wc_off_on_addr[i])) );
+           if (err == MP_OKAY)
+               err = mp_copy(M[2]->z,
+                             (mp_int*)
+                             ( ((wolfssl_word)M[0]->z & wc_off_on_addr[i^1]) +
+                               ((wolfssl_word)M[1]->z & wc_off_on_addr[i])) );
            if (err != MP_OKAY)
                break;
        } /* end for */
diff --git a/wolfcrypt/src/tfm.c b/wolfcrypt/src/tfm.c
index 06a4846e4..7c6a55518 100644
--- a/wolfcrypt/src/tfm.c
+++ b/wolfcrypt/src/tfm.c
@@ -1036,7 +1036,8 @@ int fp_addmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 #ifdef TFM_TIMING_RESISTANT
 
 /* all off / all on pointer addresses for constant calculations */
-static const wolfssl_word off_on_addr[2] =
+/* ecc.c uses same table */
+const wolfssl_word wc_off_on_addr[2] =
 {
 #if defined(WC_64BIT_CPU)
     W64LIT(0x0000000000000000),
@@ -1113,13 +1114,13 @@ static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
     /* instead of using R[y] for sqr, which leaks key bit to cache monitor,
      * use R[2] as temp, make sure address calc is constant, keep
      * &R[0] and &R[1] in cache */
-    fp_copy((fp_int*) ( ((wolfssl_word)&R[0] & off_on_addr[(y^1)]) +
-                        ((wolfssl_word)&R[1] & off_on_addr[y]) ),
+    fp_copy((fp_int*) ( ((wolfssl_word)&R[0] & wc_off_on_addr[y^1]) +
+                        ((wolfssl_word)&R[1] & wc_off_on_addr[y]) ),
             &R[2]);
     fp_sqr(&R[2], &R[2]);          fp_montgomery_reduce(&R[2], P, mp);
     fp_copy(&R[2],
-            (fp_int*) ( ((wolfssl_word)&R[0] & off_on_addr[(y^1)]) +
-                        ((wolfssl_word)&R[1] & off_on_addr[y]) ) );
+            (fp_int*) ( ((wolfssl_word)&R[0] & wc_off_on_addr[y^1]) +
+                        ((wolfssl_word)&R[1] & wc_off_on_addr[y]) ) );
   }
 
    fp_montgomery_reduce(&R[0], P, mp);