From f8aeac608c2a53e5476b16d11fe937dad6cd1ea8 Mon Sep 17 00:00:00 2001
From: John Safranek
Date: Fri, 30 Oct 2015 16:03:26 -0700
Subject: [PATCH 1/2] 1. Add C NI-intrinsic AES-GCM encrypt and decrypt.
 2. Fix error string for wolfcrypt test of GMAC.
 3. Add AES-GCM Decrypt to benchmark.

---
 configure.ac                    |   3 +
 wolfcrypt/benchmark/benchmark.c |  23 +-
 wolfcrypt/src/aes.c             | 437 ++++++++++++++++++++++++++++++++
 wolfcrypt/src/aes_asm.asm       |  79 ++++++
 wolfcrypt/src/aes_asm.s         |  88 ++++++-
 wolfcrypt/test/test.c           |   2 +-
 wolfssl/wolfcrypt/aes.h         |   2 +
 7 files changed, 630 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8d4eba4f1..d7fdca0e8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -452,6 +452,7 @@ then
             AM_CFLAGS="$AM_CFLAGS -maes -msse4"
         fi
     fi
+    AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"])
 fi
 
 if test "$ENABLED_INTELASM" = "yes"
@@ -2500,6 +2501,7 @@ CREATE_HEX_VERSION
 AC_SUBST([AM_CPPFLAGS])
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_LDFLAGS])
+AC_SUBST([AM_CCASFLAGS])
 AC_SUBST([LIB_ADD])
 AC_SUBST([LIB_STATIC_ADD])
 
@@ -2619,6 +2621,7 @@ echo "   * C Flags:            $CFLAGS"
 echo "   * C++ Compiler:       $CXX"
 echo "   * C++ Flags:          $CXXFLAGS"
 echo "   * CPP Flags:          $CPPFLAGS"
+echo "   * CCAS Flags:         $CCASFLAGS"
 echo "   * LIB Flags:          $LIB"
 echo "   * Debug enabled:      $ax_enable_debug"
 echo "   * Warnings as failure: $ac_cv_warnings_as_errors"

diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index fbcf360b2..3f709522c 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -483,7 +483,28 @@ void bench_aesgcm(void)
     persec = persec / 1024;
 #endif
 
-    printf("AES-GCM %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+    printf("AES-GCM Encrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
+
+    start = current_time(1);
+    BEGIN_INTEL_CYCLES
+
+    for(i = 0; i < numBlocks; i++)
+        wc_AesGcmDecrypt(&enc, plain, cipher, sizeof(cipher), iv, 12,
+                         tag, 16, additional, 13);
+
+    END_INTEL_CYCLES
+    total = current_time(0) - start;
+
+    persec = 1 / total * numBlocks;
+#ifdef BENCH_EMBEDDED
+    /* since using kB, convert to MB/s */
+    persec = persec / 1024;
+#endif
+
+    printf("AES-GCM Decrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+                                          blockType, total, persec);
+    SHOW_INTEL_CYCLES
+    printf("\n");
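The benchmark loop above drives the same one-shot wolfCrypt calls that the new AES-NI code accelerates. As a point of reference, a minimal sketch of an AES-128-GCM round trip through wc_AesGcmSetKey, wc_AesGcmEncrypt, and wc_AesGcmDecrypt follows. It assumes a build with AES-GCM enabled (for example --enable-aesgcm); the key, IV, and buffer contents are placeholders, and the snippet is illustrative rather than part of the patch. The 12-byte IV, 16-byte tag, and 13 bytes of additional data mirror the benchmark call above.

    #include <wolfssl/options.h>         /* assumes an installed, configured build */
    #include <wolfssl/wolfcrypt/aes.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        Aes  aes;
        byte key[16]   = { 0 };          /* 128-bit key (placeholder value)       */
        byte iv[12]    = { 0 };          /* 96-bit IV, the AES-NI fast-path size  */
        byte aad[13]   = { 0 };          /* additional authenticated data         */
        byte plain[64] = { 0 };
        byte cipher[64];
        byte decrypted[64];
        byte tag[16];
        int  ret;

        ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
        if (ret != 0) return ret;

        ret = wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain), iv, sizeof(iv),
                               tag, sizeof(tag), aad, sizeof(aad));
        if (ret != 0) return ret;

        ret = wc_AesGcmDecrypt(&aes, decrypted, cipher, sizeof(cipher), iv, sizeof(iv),
                               tag, sizeof(tag), aad, sizeof(aad));
        if (ret != 0) return ret;        /* AES_GCM_AUTH_E on tag mismatch */

        printf("round trip %s\n",
               memcmp(plain, decrypted, sizeof(plain)) == 0 ? "ok" : "bad");
        return 0;
    }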
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 0550d6118..d7524b66a 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -2763,6 +2763,426 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 }
 
+
+#ifdef WOLFSSL_AESNI
+
+void gfmul(__m128i a, __m128i b, __m128i* out) XASM_LINK("gfmul");
+
+
+/* See Intel® Carry-Less Multiplication Instruction
+ * and its Usage for Computing the GCM Mode White Paper
+ * by Shay Gueron, Intel Mobility Group, Israel Development Center;
+ * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research */
+
+
+/* Figure 9. AES-GCM – Encrypt With Single Block Ghash at a Time */
+
+static void AES_GCM_encrypt(const unsigned char *in,
+                            unsigned char *out,
+                            const unsigned char* addt,
+                            const unsigned char* ivec,
+                            unsigned char *tag,
+                            int nbytes, int abytes, int ibytes,
+                            const unsigned char* key, int nr)
+{
+    int i, j ,k;
+    __m128i tmp1, tmp2, tmp3, tmp4;
+    __m128i H, Y, T;
+    __m128i *KEY = (__m128i*)key;
+    __m128i ctr1, ctr2, ctr3, ctr4;
+    __m128i last_block = _mm_setzero_si128();
+    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
+    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
+    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+    __m128i X = _mm_setzero_si128();
+
+    if(ibytes == 96/8) {
+        Y = _mm_loadu_si128((__m128i*)ivec);
+        Y = _mm_insert_epi32(Y, 0x1000000, 3);
+            /* (Compute E[ZERO, KS] and E[Y0, KS] together */
+        tmp1 = _mm_xor_si128(X, KEY[0]);
+        tmp2 = _mm_xor_si128(Y, KEY[0]);
+        for(j=1; j < nr-1; j+=2) {
+            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
+        }
+        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
+        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+        H = _mm_shuffle_epi8(H, BSWAP_MASK);
+    }
+    else {
+        tmp1 = _mm_xor_si128(X, KEY[0]);
+        for(j=1; j key, aes->rounds);
+        return 0;
+    }
+#endif
+
 #ifdef WOLFSSL_PIC32MZ_CRYPT
     ctr = (char *)aes->iv_ce ;
 #else
@@ -3339,6 +3767,15 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
 
     WOLFSSL_ENTER("AesGcmDecrypt");
 
+#ifdef WOLFSSL_AESNI
+    if (haveAESNI) {
+        if (AES_GCM_decrypt(in, out, authIn, iv, authTag,
+                            sz, authInSz, ivSz, (byte*)aes->key, aes->rounds) == 0)
+            return AES_GCM_AUTH_E;
+        return 0;
+    }
+#endif
+
 #ifdef WOLFSSL_PIC32MZ_CRYPT
     ctr = (char *)aes->iv_ce ;
 #else
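The intrinsic AES_GCM_encrypt and AES_GCM_decrypt routines above delegate every GHASH multiplication in GF(2^128) to the assembly gfmul added to aes_asm.asm and aes_asm.s below. For readers following the cited white paper, here is a C-intrinsics sketch of the same steps: four 64x64 carry-less multiplies, a one-bit left shift to account for GCM's reflected bit order, and a two-phase reduction modulo x^128 + x^7 + x^2 + x + 1. The function name is illustrative, the sketch is not part of the patch, and it assumes a compiler with PCLMULQDQ support (for example gcc -mpclmul).

    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 */
    #include <emmintrin.h>   /* SSE2 shifts, xor, or, store */

    /* Illustrative C version of gfmul: carry-less multiply a*b, then reduce
     * modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1. */
    static void gfmul_c_sketch(__m128i a, __m128i b, __m128i* out)
    {
        __m128i t3, t4, t5, t6, t7, t8, t9;

        /* schoolbook 64x64 carry-less multiplies */
        t3 = _mm_clmulepi64_si128(a, b, 0x00);   /* a0*b0 */
        t4 = _mm_clmulepi64_si128(a, b, 0x10);   /* a0*b1 */
        t5 = _mm_clmulepi64_si128(a, b, 0x01);   /* a1*b0 */
        t6 = _mm_clmulepi64_si128(a, b, 0x11);   /* a1*b1 */

        t4 = _mm_xor_si128(t4, t5);              /* combine middle terms */
        t5 = _mm_slli_si128(t4, 8);
        t4 = _mm_srli_si128(t4, 8);
        t3 = _mm_xor_si128(t3, t5);              /* t6:t3 = 256-bit product */
        t6 = _mm_xor_si128(t6, t4);

        /* shift the 256-bit product left by one bit (GCM bits are reflected) */
        t7 = _mm_srli_epi32(t3, 31);
        t8 = _mm_srli_epi32(t6, 31);
        t3 = _mm_slli_epi32(t3, 1);
        t6 = _mm_slli_epi32(t6, 1);
        t9 = _mm_srli_si128(t7, 12);             /* carry from low into high half */
        t8 = _mm_slli_si128(t8, 4);
        t7 = _mm_slli_si128(t7, 4);
        t3 = _mm_or_si128(t3, t7);
        t6 = _mm_or_si128(t6, t8);
        t6 = _mm_or_si128(t6, t9);

        /* first reduction phase */
        t7 = _mm_slli_epi32(t3, 31);
        t8 = _mm_slli_epi32(t3, 30);
        t9 = _mm_slli_epi32(t3, 25);
        t7 = _mm_xor_si128(t7, t8);
        t7 = _mm_xor_si128(t7, t9);
        t8 = _mm_srli_si128(t7, 4);
        t7 = _mm_slli_si128(t7, 12);
        t3 = _mm_xor_si128(t3, t7);

        /* second reduction phase */
        t4 = _mm_srli_epi32(t3, 1);
        t5 = _mm_srli_epi32(t3, 2);
        t9 = _mm_srli_epi32(t3, 7);
        t4 = _mm_xor_si128(t4, t5);
        t4 = _mm_xor_si128(t4, t9);
        t4 = _mm_xor_si128(t4, t8);
        t3 = _mm_xor_si128(t3, t4);
        t6 = _mm_xor_si128(t6, t3);              /* reduced 128-bit result */

        _mm_storeu_si128(out, t6);
    }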
diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm
index 1e3d2d99e..439dacc51 100644
--- a/wolfcrypt/src/aes_asm.asm
+++ b/wolfcrypt/src/aes_asm.asm
@@ -969,4 +969,83 @@ MAKE_RK256_b:
   pxor xmm3,xmm2
   ret
+
+; See Intel® Carry-Less Multiplication Instruction
+; and its Usage for Computing the GCM Mode White Paper
+; by Shay Gueron, Intel Mobility Group, Israel Development Center;
+; and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
+
+; void gfmul(__m128i a, __m128i b, __m128i* out);
+
+; .globl gfmul
+gfmul PROC
+    ; xmm0 holds operand a (128 bits)
+    ; xmm1 holds operand b (128 bits)
+    ; rdi holds the pointer to output (128 bits)
+    movdqa    %xmm0, %xmm3
+    pclmulqdq $0, %xmm1, %xmm3      ; xmm3 holds a0*b0
+    movdqa    %xmm0, %xmm4
+    pclmulqdq $16, %xmm1, %xmm4     ; xmm4 holds a0*b1
+    movdqa    %xmm0, %xmm5
+    pclmulqdq $1, %xmm1, %xmm5      ; xmm5 holds a1*b0
+    movdqa    %xmm0, %xmm6
+    pclmulqdq $17, %xmm1, %xmm6     ; xmm6 holds a1*b1
+    pxor      %xmm5, %xmm4          ; xmm4 holds a0*b1 + a1*b0
+    movdqa    %xmm4, %xmm5
+    psrldq    $8, %xmm4
+    pslldq    $8, %xmm5
+    pxor      %xmm5, %xmm3
+    pxor      %xmm4, %xmm6          ; xmm6:xmm3 holds the result of
+                                    ; the carry-less multiplication of
+                                    ; xmm0 by xmm1
+
+; shift the result by one bit position to the left cope for the fact
+; that bits are reversed
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm6, %xmm8
+    pslld     $1, %xmm3
+    pslld     $1, %xmm6
+    psrld     $31, %xmm7
+    psrld     $31, %xmm8
+    movdqa    %xmm7, %xmm9
+    pslldq    $4, %xmm8
+    pslldq    $4, %xmm7
+    psrldq    $12, %xmm9
+    por       %xmm7, %xmm3
+    por       %xmm8, %xmm6
+    por       %xmm9, %xmm6
+
+; first phase of the reduction
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm3, %xmm8
+    movdqa    %xmm3, %xmm9
+    pslld     $31, %xmm7            ; packed right shifting << 31
+    pslld     $30, %xmm8            ; packed right shifting shift << 30
+    pslld     $25, %xmm9            ; packed right shifting shift << 25
+    pxor      %xmm8, %xmm7          ; xor the shifted versions
+    pxor      %xmm9, %xmm7
+
+    movdqa    %xmm7, %xmm8
+    pslldq    $12, %xmm7
+    psrldq    $4, %xmm8
+    pxor      %xmm7, %xmm3          ; first phase of the reduction complete
+    movdqa    %xmm3,%xmm2           ; second phase of the reduction
+    movdqa    %xmm3,%xmm4
+    movdqa    %xmm3,%xmm5
+    psrld     $1, %xmm2             ; packed left shifting >> 1
+    psrld     $2, %xmm4             ; packed left shifting >> 2
+    psrld     $7, %xmm5             ; packed left shifting >> 7
+
+    pxor      %xmm4, %xmm2          ; xor the shifted versions
+    pxor      %xmm5, %xmm2
+    pxor      %xmm8, %xmm2
+    pxor      %xmm2, %xmm3
+    pxor      %xmm3, %xmm6          ; the result is in xmm6
+    movdqu    %xmm6, (%rdi)         ; store the result
+
+    ; restore xmm6 and xmm7
+
+    ret
+gfmul ENDP
+
 END

diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s
index b50c7ff95..92d670416 100644
--- a/wolfcrypt/src/aes_asm.s
+++ b/wolfcrypt/src/aes_asm.s
@@ -20,12 +20,12 @@
  */
 
+/* This file is in at&t asm syntax, see .asm for intel syntax */
+
 /* See Intel® Advanced Encryption Standard (AES) Instructions Set White Paper
  * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron */
 
-/* This file is in at&t asm syntax, see .asm for intel syntax */
-
 
 /*
 AES_CBC_encrypt (const unsigned char *in,
@@ -814,3 +814,87 @@
 pxor %xmm4, %xmm3
 pxor %xmm2, %xmm3
 ret
+
+#ifdef HAVE_AESGCM
+
+/* See Intel® Carry-Less Multiplication Instruction
+ * and its Usage for Computing the GCM Mode White Paper
+ * by Shay Gueron, Intel Mobility Group, Israel Development Center;
+ * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
+ *
+ * This is for use with the C code.
+ */
+
+/* Figure 6. Code Sample - Performing Ghash Using Algorithms 1 and 5 */
+
+/*
+ * void gfmul(__m128i a, __m128i b, __m128i* out);
+ */
+.globl gfmul
+gfmul:
+    #xmm0 holds operand a (128 bits)
+    #xmm1 holds operand b (128 bits)
+    #rdi holds the pointer to output (128 bits)
+    movdqa    %xmm0, %xmm3
+    pclmulqdq $0, %xmm1, %xmm3      # xmm3 holds a0*b0
+    movdqa    %xmm0, %xmm4
+    pclmulqdq $16, %xmm1, %xmm4     # xmm4 holds a0*b1
+    movdqa    %xmm0, %xmm5
+    pclmulqdq $1, %xmm1, %xmm5      # xmm5 holds a1*b0
+    movdqa    %xmm0, %xmm6
+    pclmulqdq $17, %xmm1, %xmm6     # xmm6 holds a1*b1
+    pxor      %xmm5, %xmm4          # xmm4 holds a0*b1 + a1*b0
+    movdqa    %xmm4, %xmm5
+    psrldq    $8, %xmm4
+    pslldq    $8, %xmm5
+    pxor      %xmm5, %xmm3
+    pxor      %xmm4, %xmm6          # xmm6:xmm3 holds the result of
+                                    # the carry-less multiplication of
+                                    # xmm0 by xmm1
+
+# shift the result by one bit position to the left cope for the fact
+# that bits are reversed
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm6, %xmm8
+    pslld     $1, %xmm3
+    pslld     $1, %xmm6
+    psrld     $31, %xmm7
+    psrld     $31, %xmm8
+    movdqa    %xmm7, %xmm9
+    pslldq    $4, %xmm8
+    pslldq    $4, %xmm7
+    psrldq    $12, %xmm9
+    por       %xmm7, %xmm3
+    por       %xmm8, %xmm6
+    por       %xmm9, %xmm6
+
+# first phase of the reduction
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm3, %xmm8
+    movdqa    %xmm3, %xmm9
+    pslld     $31, %xmm7            # packed right shifting << 31
+    pslld     $30, %xmm8            # packed right shifting shift << 30
+    pslld     $25, %xmm9            # packed right shifting shift << 25
+    pxor      %xmm8, %xmm7          # xor the shifted versions
+    pxor      %xmm9, %xmm7
+
+    movdqa    %xmm7, %xmm8
+    pslldq    $12, %xmm7
+    psrldq    $4, %xmm8
+    pxor      %xmm7, %xmm3          # first phase of the reduction complete
+    movdqa    %xmm3,%xmm2           # second phase of the reduction
+    movdqa    %xmm3,%xmm4
+    movdqa    %xmm3,%xmm5
+    psrld     $1, %xmm2             # packed left shifting >> 1
+    psrld     $2, %xmm4             # packed left shifting >> 2
+    psrld     $7, %xmm5             # packed left shifting >> 7
+
+    pxor      %xmm4, %xmm2          # xor the shifted versions
+    pxor      %xmm5, %xmm2
+    pxor      %xmm8, %xmm2
+    pxor      %xmm2, %xmm3
+    pxor      %xmm3, %xmm6          # the result is in xmm6
+    movdqu    %xmm6, (%rdi)         # store the result
+    ret
+
+#endif /* HAVE_AESGCM */

diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index f804e6d9f..fbdc43a63 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -390,7 +390,7 @@ int wolfcrypt_test(void* args)
 
 #ifdef HAVE_AESGCM
     if ( (ret = gmac_test()) != 0)
-        return err_sys("GMAC test passed!\n", ret);
+        return err_sys("GMAC test failed!\n", ret);
     else
         printf( "GMAC test passed!\n");
 #endif

diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h
index 480412a21..f850c3ca8 100644
--- a/wolfssl/wolfcrypt/aes.h
+++ b/wolfssl/wolfcrypt/aes.h
@@ -46,6 +46,8 @@
 
 #ifdef WOLFSSL_AESNI
 
 #include
+#include
+#include
 
 #if !defined (ALIGN16)
     #if defined (__GNUC__)

From 28dcef2d71d453fbbc012861d58610dadb2ce55a Mon Sep 17 00:00:00 2001
From: toddouska
Date: Mon, 2 Nov 2015 09:39:34 -0800
Subject: [PATCH 2/2] gcm benchmark results format alignment

---
 wolfcrypt/benchmark/benchmark.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index 3f709522c..436c4cf7f 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -483,11 +483,12 @@ void bench_aesgcm(void)
     persec = persec / 1024;
 #endif
 
-    printf("AES-GCM Encrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+    printf("AES-GCM %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
 
+#if 0
     start = current_time(1);
     BEGIN_INTEL_CYCLES
 
     for(i = 0; i < numBlocks; i++)
         wc_AesGcmDecrypt(&enc, plain, cipher, sizeof(cipher), iv, 12,
                          tag, 16, additional, 13);
 
     END_INTEL_CYCLES
     total = current_time(0) - start;
 
     persec = 1 / total * numBlocks;
 #ifdef BENCH_EMBEDDED
     /* since using kB, convert to MB/s */
     persec = persec / 1024;
 #endif
 
     printf("AES-GCM Decrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
+#endif
 }
 #endif
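Both wc_AesGcmEncrypt and wc_AesGcmDecrypt take the new branch only when wolfCrypt's existing haveAESNI flag indicates runtime support. For background only, and not as part of either patch, the two CPU features this code relies on, AES-NI for the round instructions and PCLMULQDQ for gfmul, are both reported through CPUID leaf 1 in ECX. A GCC-style probe might look like the following; the helper name is made up for illustration.

    #include <cpuid.h>   /* GCC/Clang helper; other compilers need their own CPUID access */

    /* Illustrative runtime check: ECX bit 25 = AES-NI, ECX bit 1 = PCLMULQDQ. */
    static int cpu_has_aesni_clmul(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;

        return ((ecx >> 25) & 1) && ((ecx >> 1) & 1);
    }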