From f8aeac608c2a53e5476b16d11fe937dad6cd1ea8 Mon Sep 17 00:00:00 2001
From: John Safranek
Date: Fri, 30 Oct 2015 16:03:26 -0700
Subject: [PATCH 1/2] 1. Add C NI-intrinsic AES-GCM encrypt and decrypt.
 2. Fix error string for wolfcrypt test of GMAC.
 3. Add AES-GCM Decrypt to benchmark.

---
 configure.ac                    |   3 +
 wolfcrypt/benchmark/benchmark.c |  23 +-
 wolfcrypt/src/aes.c             | 437 ++++++++++++++++++++++++++++++++
 wolfcrypt/src/aes_asm.asm       |  79 ++++++
 wolfcrypt/src/aes_asm.s         |  88 ++++++-
 wolfcrypt/test/test.c           |   2 +-
 wolfssl/wolfcrypt/aes.h         |   2 +
 7 files changed, 630 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8d4eba4f1..d7fdca0e8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -452,6 +452,7 @@ then
             AM_CFLAGS="$AM_CFLAGS -maes -msse4"
         fi
     fi
+    AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"])
 fi
 
 if test "$ENABLED_INTELASM" = "yes"
@@ -2500,6 +2501,7 @@ CREATE_HEX_VERSION
 AC_SUBST([AM_CPPFLAGS])
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_LDFLAGS])
+AC_SUBST([AM_CCASFLAGS])
 AC_SUBST([LIB_ADD])
 AC_SUBST([LIB_STATIC_ADD])
 
@@ -2619,6 +2621,7 @@ echo "   * C Flags:            $CFLAGS"
 echo "   * C++ Compiler:       $CXX"
 echo "   * C++ Flags:          $CXXFLAGS"
 echo "   * CPP Flags:          $CPPFLAGS"
+echo "   * CCAS Flags:         $CCASFLAGS"
 echo "   * LIB Flags:          $LIB"
 echo "   * Debug enabled:      $ax_enable_debug"
 echo "   * Warnings as failure: $ac_cv_warnings_as_errors"

diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index fbcf360b2..3f709522c 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -483,7 +483,28 @@ void bench_aesgcm(void)
     persec = persec / 1024;
 #endif
 
-    printf("AES-GCM %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+    printf("AES-GCM Encrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
+
+    start = current_time(1);
+    BEGIN_INTEL_CYCLES
+
+    for(i = 0; i < numBlocks; i++)
+        wc_AesGcmDecrypt(&enc, plain, cipher, sizeof(cipher), iv, 12,
+                         tag, 16, additional, 13);
+
+    END_INTEL_CYCLES
+    total = current_time(0) - start;
+
+    persec = 1 / total * numBlocks;
+#ifdef BENCH_EMBEDDED
+    /* since using kB, convert to MB/s */
+    persec = persec / 1024;
+#endif
+
+    printf("AES-GCM Decrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+                                          blockType, total, persec);
+    SHOW_INTEL_CYCLES
+    printf("\n");
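The benchmark loop above drives the same one-shot wolfCrypt calls that the new AES-NI code accelerates. As a point of reference, a minimal sketch of an AES-128-GCM round trip through wc_AesGcmSetKey, wc_AesGcmEncrypt, and wc_AesGcmDecrypt follows. It assumes a build with AES-GCM enabled (for example --enable-aesgcm); the key, IV, and buffer contents are placeholders, and the snippet is illustrative rather than part of the patch. The 12-byte IV, 16-byte tag, and 13 bytes of additional data mirror the benchmark call above.

    #include <wolfssl/options.h>         /* assumes an installed, configured build */
    #include <wolfssl/wolfcrypt/aes.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        Aes  aes;
        byte key[16]   = { 0 };          /* 128-bit key (placeholder value)       */
        byte iv[12]    = { 0 };          /* 96-bit IV, the AES-NI fast-path size  */
        byte aad[13]   = { 0 };          /* additional authenticated data         */
        byte plain[64] = { 0 };
        byte cipher[64];
        byte decrypted[64];
        byte tag[16];
        int  ret;

        ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
        if (ret != 0) return ret;

        ret = wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain), iv, sizeof(iv),
                               tag, sizeof(tag), aad, sizeof(aad));
        if (ret != 0) return ret;

        ret = wc_AesGcmDecrypt(&aes, decrypted, cipher, sizeof(cipher), iv, sizeof(iv),
                               tag, sizeof(tag), aad, sizeof(aad));
        if (ret != 0) return ret;        /* AES_GCM_AUTH_E on tag mismatch */

        printf("round trip %s\n",
               memcmp(plain, decrypted, sizeof(plain)) == 0 ? "ok" : "bad");
        return 0;
    }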
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 0550d6118..d7524b66a 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -2763,6 +2763,426 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 }
 
+
+#ifdef WOLFSSL_AESNI
+
+void gfmul(__m128i a, __m128i b, __m128i* out) XASM_LINK("gfmul");
+
+
+/* See Intel® Carry-Less Multiplication Instruction
+ * and its Usage for Computing the GCM Mode White Paper
+ * by Shay Gueron, Intel Mobility Group, Israel Development Center;
+ * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research */
+
+
+/* Figure 9. AES-GCM – Encrypt With Single Block Ghash at a Time */
+
+static void AES_GCM_encrypt(const unsigned char *in,
+                            unsigned char *out,
+                            const unsigned char* addt,
+                            const unsigned char* ivec,
+                            unsigned char *tag,
+                            int nbytes, int abytes, int ibytes,
+                            const unsigned char* key, int nr)
+{
+    int i, j ,k;
+    __m128i tmp1, tmp2, tmp3, tmp4;
+    __m128i H, Y, T;
+    __m128i *KEY = (__m128i*)key;
+    __m128i ctr1, ctr2, ctr3, ctr4;
+    __m128i last_block = _mm_setzero_si128();
+    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
+    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
+    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+    __m128i X = _mm_setzero_si128();
+
+    if(ibytes == 96/8) {
+        Y = _mm_loadu_si128((__m128i*)ivec);
+        Y = _mm_insert_epi32(Y, 0x1000000, 3);
+            /* (Compute E[ZERO, KS] and E[Y0, KS] together */
+        tmp1 = _mm_xor_si128(X, KEY[0]);
+        tmp2 = _mm_xor_si128(Y, KEY[0]);
+        for(j=1; j < nr-1; j+=2) {
+            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
+        }
+        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
+        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+        H = _mm_shuffle_epi8(H, BSWAP_MASK);
+    }
+    else {
+        tmp1 = _mm_xor_si128(X, KEY[0]);
+        for(j=1; j key, aes->rounds);
+        return 0;
+    }
+#endif
+
 #ifdef WOLFSSL_PIC32MZ_CRYPT
     ctr = (char *)aes->iv_ce ;
 #else
@@ -3339,6 +3767,15 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
 
     WOLFSSL_ENTER("AesGcmDecrypt");
 
+#ifdef WOLFSSL_AESNI
+    if (haveAESNI) {
+        if (AES_GCM_decrypt(in, out, authIn, iv, authTag,
+                            sz, authInSz, ivSz, (byte*)aes->key, aes->rounds) == 0)
+            return AES_GCM_AUTH_E;
+        return 0;
+    }
+#endif
+
 #ifdef WOLFSSL_PIC32MZ_CRYPT
     ctr = (char *)aes->iv_ce ;
 #else
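The intrinsic AES_GCM_encrypt and AES_GCM_decrypt routines above delegate every GHASH multiplication in GF(2^128) to the assembly gfmul added to aes_asm.asm and aes_asm.s below. For readers following the cited white paper, here is a C-intrinsics sketch of the same steps: four 64x64 carry-less multiplies, a one-bit left shift to account for GCM's reflected bit order, and a two-phase reduction modulo x^128 + x^7 + x^2 + x + 1. The function name is illustrative, the sketch is not part of the patch, and it assumes a compiler with PCLMULQDQ support (for example gcc -mpclmul).

    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 */
    #include <emmintrin.h>   /* SSE2 shifts, xor, or, store */

    /* Illustrative C version of gfmul: carry-less multiply a*b, then reduce
     * modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1. */
    static void gfmul_c_sketch(__m128i a, __m128i b, __m128i* out)
    {
        __m128i t3, t4, t5, t6, t7, t8, t9;

        /* schoolbook 64x64 carry-less multiplies */
        t3 = _mm_clmulepi64_si128(a, b, 0x00);   /* a0*b0 */
        t4 = _mm_clmulepi64_si128(a, b, 0x10);   /* a0*b1 */
        t5 = _mm_clmulepi64_si128(a, b, 0x01);   /* a1*b0 */
        t6 = _mm_clmulepi64_si128(a, b, 0x11);   /* a1*b1 */

        t4 = _mm_xor_si128(t4, t5);              /* combine middle terms */
        t5 = _mm_slli_si128(t4, 8);
        t4 = _mm_srli_si128(t4, 8);
        t3 = _mm_xor_si128(t3, t5);              /* t6:t3 = 256-bit product */
        t6 = _mm_xor_si128(t6, t4);

        /* shift the 256-bit product left by one bit (GCM bits are reflected) */
        t7 = _mm_srli_epi32(t3, 31);
        t8 = _mm_srli_epi32(t6, 31);
        t3 = _mm_slli_epi32(t3, 1);
        t6 = _mm_slli_epi32(t6, 1);
        t9 = _mm_srli_si128(t7, 12);             /* carry from low into high half */
        t8 = _mm_slli_si128(t8, 4);
        t7 = _mm_slli_si128(t7, 4);
        t3 = _mm_or_si128(t3, t7);
        t6 = _mm_or_si128(t6, t8);
        t6 = _mm_or_si128(t6, t9);

        /* first reduction phase */
        t7 = _mm_slli_epi32(t3, 31);
        t8 = _mm_slli_epi32(t3, 30);
        t9 = _mm_slli_epi32(t3, 25);
        t7 = _mm_xor_si128(t7, t8);
        t7 = _mm_xor_si128(t7, t9);
        t8 = _mm_srli_si128(t7, 4);
        t7 = _mm_slli_si128(t7, 12);
        t3 = _mm_xor_si128(t3, t7);

        /* second reduction phase */
        t4 = _mm_srli_epi32(t3, 1);
        t5 = _mm_srli_epi32(t3, 2);
        t9 = _mm_srli_epi32(t3, 7);
        t4 = _mm_xor_si128(t4, t5);
        t4 = _mm_xor_si128(t4, t9);
        t4 = _mm_xor_si128(t4, t8);
        t3 = _mm_xor_si128(t3, t4);
        t6 = _mm_xor_si128(t6, t3);              /* reduced 128-bit result */

        _mm_storeu_si128(out, t6);
    }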
diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm
index 1e3d2d99e..439dacc51 100644
--- a/wolfcrypt/src/aes_asm.asm
+++ b/wolfcrypt/src/aes_asm.asm
@@ -969,4 +969,83 @@ MAKE_RK256_b:
   pxor xmm3,xmm2
   ret
+
+; See Intel® Carry-Less Multiplication Instruction
+; and its Usage for Computing the GCM Mode White Paper
+; by Shay Gueron, Intel Mobility Group, Israel Development Center;
+; and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
+
+; void gfmul(__m128i a, __m128i b, __m128i* out);
+
+; .globl gfmul
+gfmul PROC
+    ; xmm0 holds operand a (128 bits)
+    ; xmm1 holds operand b (128 bits)
+    ; rdi holds the pointer to output (128 bits)
+    movdqa    %xmm0, %xmm3
+    pclmulqdq $0, %xmm1, %xmm3      ; xmm3 holds a0*b0
+    movdqa    %xmm0, %xmm4
+    pclmulqdq $16, %xmm1, %xmm4     ; xmm4 holds a0*b1
+    movdqa    %xmm0, %xmm5
+    pclmulqdq $1, %xmm1, %xmm5      ; xmm5 holds a1*b0
+    movdqa    %xmm0, %xmm6
+    pclmulqdq $17, %xmm1, %xmm6     ; xmm6 holds a1*b1
+    pxor      %xmm5, %xmm4          ; xmm4 holds a0*b1 + a1*b0
+    movdqa    %xmm4, %xmm5
+    psrldq    $8, %xmm4
+    pslldq    $8, %xmm5
+    pxor      %xmm5, %xmm3
+    pxor      %xmm4, %xmm6          ; xmm6:xmm3 holds the result of
+                                    ; the carry-less multiplication of
+                                    ; xmm0 by xmm1
+
+; shift the result by one bit position to the left cope for the fact
+; that bits are reversed
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm6, %xmm8
+    pslld     $1, %xmm3
+    pslld     $1, %xmm6
+    psrld     $31, %xmm7
+    psrld     $31, %xmm8
+    movdqa    %xmm7, %xmm9
+    pslldq    $4, %xmm8
+    pslldq    $4, %xmm7
+    psrldq    $12, %xmm9
+    por       %xmm7, %xmm3
+    por       %xmm8, %xmm6
+    por       %xmm9, %xmm6
+
+; first phase of the reduction
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm3, %xmm8
+    movdqa    %xmm3, %xmm9
+    pslld     $31, %xmm7            ; packed right shifting << 31
+    pslld     $30, %xmm8            ; packed right shifting shift << 30
+    pslld     $25, %xmm9            ; packed right shifting shift << 25
+    pxor      %xmm8, %xmm7          ; xor the shifted versions
+    pxor      %xmm9, %xmm7
+
+    movdqa    %xmm7, %xmm8
+    pslldq    $12, %xmm7
+    psrldq    $4, %xmm8
+    pxor      %xmm7, %xmm3          ; first phase of the reduction complete
+    movdqa    %xmm3,%xmm2           ; second phase of the reduction
+    movdqa    %xmm3,%xmm4
+    movdqa    %xmm3,%xmm5
+    psrld     $1, %xmm2             ; packed left shifting >> 1
+    psrld     $2, %xmm4             ; packed left shifting >> 2
+    psrld     $7, %xmm5             ; packed left shifting >> 7
+
+    pxor      %xmm4, %xmm2          ; xor the shifted versions
+    pxor      %xmm5, %xmm2
+    pxor      %xmm8, %xmm2
+    pxor      %xmm2, %xmm3
+    pxor      %xmm3, %xmm6          ; the result is in xmm6
+    movdqu    %xmm6, (%rdi)         ; store the result
+
+    ; restore xmm6 and xmm7
+
+    ret
+gfmul ENDP
+
 END

diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s
index b50c7ff95..92d670416 100644
--- a/wolfcrypt/src/aes_asm.s
+++ b/wolfcrypt/src/aes_asm.s
@@ -20,12 +20,12 @@
  */
 
+/* This file is in at&t asm syntax, see .asm for intel syntax */
+
 /* See Intel® Advanced Encryption Standard (AES) Instructions Set White Paper
  * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron */
 
-/* This file is in at&t asm syntax, see .asm for intel syntax */
-
 
 /*
 AES_CBC_encrypt (const unsigned char *in,
@@ -814,3 +814,87 @@
 pxor %xmm4, %xmm3
 pxor %xmm2, %xmm3
 ret
+
+#ifdef HAVE_AESGCM
+
+/* See Intel® Carry-Less Multiplication Instruction
+ * and its Usage for Computing the GCM Mode White Paper
+ * by Shay Gueron, Intel Mobility Group, Israel Development Center;
+ * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
+ *
+ * This is for use with the C code.
+ */
+
+/* Figure 6. Code Sample - Performing Ghash Using Algorithms 1 and 5 */
+
+/*
+ * void gfmul(__m128i a, __m128i b, __m128i* out);
+ */
+.globl gfmul
+gfmul:
+    #xmm0 holds operand a (128 bits)
+    #xmm1 holds operand b (128 bits)
+    #rdi holds the pointer to output (128 bits)
+    movdqa    %xmm0, %xmm3
+    pclmulqdq $0, %xmm1, %xmm3      # xmm3 holds a0*b0
+    movdqa    %xmm0, %xmm4
+    pclmulqdq $16, %xmm1, %xmm4     # xmm4 holds a0*b1
+    movdqa    %xmm0, %xmm5
+    pclmulqdq $1, %xmm1, %xmm5      # xmm5 holds a1*b0
+    movdqa    %xmm0, %xmm6
+    pclmulqdq $17, %xmm1, %xmm6     # xmm6 holds a1*b1
+    pxor      %xmm5, %xmm4          # xmm4 holds a0*b1 + a1*b0
+    movdqa    %xmm4, %xmm5
+    psrldq    $8, %xmm4
+    pslldq    $8, %xmm5
+    pxor      %xmm5, %xmm3
+    pxor      %xmm4, %xmm6          # xmm6:xmm3 holds the result of
+                                    # the carry-less multiplication of
+                                    # xmm0 by xmm1
+
+# shift the result by one bit position to the left cope for the fact
+# that bits are reversed
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm6, %xmm8
+    pslld     $1, %xmm3
+    pslld     $1, %xmm6
+    psrld     $31, %xmm7
+    psrld     $31, %xmm8
+    movdqa    %xmm7, %xmm9
+    pslldq    $4, %xmm8
+    pslldq    $4, %xmm7
+    psrldq    $12, %xmm9
+    por       %xmm7, %xmm3
+    por       %xmm8, %xmm6
+    por       %xmm9, %xmm6
+
+# first phase of the reduction
+    movdqa    %xmm3, %xmm7
+    movdqa    %xmm3, %xmm8
+    movdqa    %xmm3, %xmm9
+    pslld     $31, %xmm7            # packed right shifting << 31
+    pslld     $30, %xmm8            # packed right shifting shift << 30
+    pslld     $25, %xmm9            # packed right shifting shift << 25
+    pxor      %xmm8, %xmm7          # xor the shifted versions
+    pxor      %xmm9, %xmm7
+
+    movdqa    %xmm7, %xmm8
+    pslldq    $12, %xmm7
+    psrldq    $4, %xmm8
+    pxor      %xmm7, %xmm3          # first phase of the reduction complete
+    movdqa    %xmm3,%xmm2           # second phase of the reduction
+    movdqa    %xmm3,%xmm4
+    movdqa    %xmm3,%xmm5
+    psrld     $1, %xmm2             # packed left shifting >> 1
+    psrld     $2, %xmm4             # packed left shifting >> 2
+    psrld     $7, %xmm5             # packed left shifting >> 7
+
+    pxor      %xmm4, %xmm2          # xor the shifted versions
+    pxor      %xmm5, %xmm2
+    pxor      %xmm8, %xmm2
+    pxor      %xmm2, %xmm3
+    pxor      %xmm3, %xmm6          # the result is in xmm6
+    movdqu    %xmm6, (%rdi)         # store the result
+    ret
+
+#endif /* HAVE_AESGCM */

diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index f804e6d9f..fbdc43a63 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -390,7 +390,7 @@ int wolfcrypt_test(void* args)
 
 #ifdef HAVE_AESGCM
     if ( (ret = gmac_test()) != 0)
-        return err_sys("GMAC test passed!\n", ret);
+        return err_sys("GMAC test failed!\n", ret);
     else
         printf( "GMAC test passed!\n");
 #endif

diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h
index 480412a21..f850c3ca8 100644
--- a/wolfssl/wolfcrypt/aes.h
+++ b/wolfssl/wolfcrypt/aes.h
@@ -46,6 +46,8 @@
 
 #ifdef WOLFSSL_AESNI
 
 #include
+#include
+#include
 
 #if !defined (ALIGN16)
     #if defined (__GNUC__)

From 28dcef2d71d453fbbc012861d58610dadb2ce55a Mon Sep 17 00:00:00 2001
From: toddouska
Date: Mon, 2 Nov 2015 09:39:34 -0800
Subject: [PATCH 2/2] gcm benchmark results format alignment

---
 wolfcrypt/benchmark/benchmark.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index 3f709522c..436c4cf7f 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -483,11 +483,12 @@ void bench_aesgcm(void)
     persec = persec / 1024;
 #endif
 
-    printf("AES-GCM Encrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+    printf("AES-GCM %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
 
+#if 0
     start = current_time(1);
     BEGIN_INTEL_CYCLES
 
     for(i = 0; i < numBlocks; i++)
         wc_AesGcmDecrypt(&enc, plain, cipher, sizeof(cipher), iv, 12,
                          tag, 16, additional, 13);
 
     END_INTEL_CYCLES
     total = current_time(0) - start;
 
     persec = 1 / total * numBlocks;
 #ifdef BENCH_EMBEDDED
     /* since using kB, convert to MB/s */
     persec = persec / 1024;
 #endif
 
     printf("AES-GCM Decrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                           blockType, total, persec);
     SHOW_INTEL_CYCLES
     printf("\n");
+#endif
 }
 #endif
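Both wc_AesGcmEncrypt and wc_AesGcmDecrypt take the new branch only when wolfCrypt's existing haveAESNI flag indicates runtime support. For background only, and not as part of either patch, the two CPU features this code relies on, AES-NI for the round instructions and PCLMULQDQ for gfmul, are both reported through CPUID leaf 1 in ECX. A GCC-style probe might look like the following; the helper name is made up for illustration.

    #include <cpuid.h>   /* GCC/Clang helper; other compilers need their own CPUID access */

    /* Illustrative runtime check: ECX bit 25 = AES-NI, ECX bit 1 = PCLMULQDQ. */
    static int cpu_has_aesni_clmul(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;

        return ((ecx >> 25) & 1) && ((ecx >> 1) & 1);
    }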