diff --git a/configure.ac b/configure.ac index 86a244908..ae96431f4 100644 --- a/configure.ac +++ b/configure.ac @@ -789,6 +789,11 @@ AC_ARG_ENABLE([intelasm], [ ENABLED_INTELASM=no ] ) +if test "$ENABLED_AESNI" = "small" +then + AM_CFLAGS="$AM_CFLAGS -DAES_GCM_AESNI_NO_UNROLL" + ENABLED_AESNI=yes +fi if test "$ENABLED_AESNI" = "yes" || test "$ENABLED_INTELASM" = "yes" then @@ -799,7 +804,7 @@ then # opt levels greater than 2 may cause problems on systems w/o aesni if test "$CC" != "icc" then - AM_CFLAGS="$AM_CFLAGS -maes -msse4" + AM_CFLAGS="$AM_CFLAGS -maes -msse4 -mpclmul" fi fi AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"]) diff --git a/src/include.am b/src/include.am index 1e2d11c6b..7620d725d 100644 --- a/src/include.am +++ b/src/include.am @@ -61,7 +61,8 @@ endif src_libwolfssl_la_SOURCES += \ wolfcrypt/src/hmac.c \ - wolfcrypt/src/hash.c + wolfcrypt/src/hash.c \ + wolfcrypt/src/cpuid.c if BUILD_RNG src_libwolfssl_la_SOURCES += wolfcrypt/src/random.c diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 68681f549..5084aff58 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -137,8 +137,8 @@ #define BEGIN_INTEL_CYCLES total_cycles = get_intel_cycles(); #define END_INTEL_CYCLES total_cycles = get_intel_cycles() - total_cycles; #define SHOW_INTEL_CYCLES printf(" Cycles per byte = %6.2f", \ - count == 0 ? 0 : \ - (float)total_cycles / (count*BENCH_SIZE)); + count == 0 ? 0 : \ + (float)total_cycles / ((word64)count*BENCH_SIZE)); #elif defined(LINUX_CYCLE_COUNT) #include #include @@ -579,7 +579,7 @@ static void bench_stats_sym_finish(const char* desc, int doAsync, int count, dou persec = (1 / total) * blocks; } - printf("%-8s%s %5.0f %s took %5.3f seconds, %8.3f %s/s", + printf("%-12s%s %5.0f %s took %5.3f seconds, %8.3f %s/s", desc, BENCH_ASYNC_GET_NAME(doAsync), blocks, blockType, total, persec, blockType); SHOW_INTEL_CYCLES @@ -1275,7 +1275,31 @@ void bench_aesgcm(int doAsync) count += times; } while (bench_stats_sym_check(start)); exit_aes_gcm: - bench_stats_sym_finish("AES-GCM", doAsync, count, start); + bench_stats_sym_finish("AES-GCM-Enc", doAsync, count, start); + + /* GCM uses same routine in backend for both encrypt and decrypt */ + bench_stats_start(&count, &start); + do { + for (times = 0; times < numBlocks || BENCH_ASYNC_IS_PEND(); ) { + bench_async_poll(); + + /* while free pending slots in queue, submit ops */ + for (i = 0; i < BENCH_MAX_PENDING; i++) { + if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×, numBlocks)) { + ret = wc_AesGcmDecrypt(&enc[i], bench_plain, + bench_cipher, BENCH_SIZE, + bench_iv, 12, bench_tag, AES_AUTH_TAG_SZ, + bench_additional, AES_AUTH_ADD_SZ); + if (!bench_async_handle(&ret, BENCH_ASYNC_GET_DEV(&enc[i]), 0, ×)) { + goto exit_aes_gcm_dec; + } + } + } /* for i */ + } /* for times */ + count += times; + } while (bench_stats_sym_check(start)); +exit_aes_gcm_dec: + bench_stats_sym_finish("AES-GCM-Dec", doAsync, count, start); exit: diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index a9bc7b2c4..04d5d0318 100755 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -29,6 +29,7 @@ #ifndef NO_AES #include +#include /* fips wrapper calls, user can call direct */ @@ -614,30 +615,14 @@ #endif #ifndef _MSC_VER - #define cpuid(reg, func)\ - __asm__ __volatile__ ("cpuid":\ - "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ - "a" (func)); - #define XASM_LINK(f) asm(f) #else - - #include - #define cpuid(a,b) 
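/* Note: the hand-rolled cpuid inline asm / MSVC __cpuid wrapper here is being
 * removed; AES-NI detection now goes through the shared wolfcrypt/src/cpuid.c
 * module (added to src/include.am above), so Check_CPU_support_AES() simply
 * queries IS_INTEL_AESNI(cpuid_get_flags()). */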
__cpuid((int*)a,b) - #define XASM_LINK(f) #endif /* _MSC_VER */ - static int Check_CPU_support_AES(void) { - unsigned int reg[4]; /* put a,b,c,d into 0,1,2,3 */ - cpuid(reg, 1); /* query info 1 */ - - if (reg[2] & 0x2000000) - return 1; - - return 0; + return IS_INTEL_AESNI(cpuid_get_flags()) != 0; } static int checkAESNI = 0; @@ -3538,8 +3523,206 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) #ifdef WOLFSSL_AESNI -void gfmul(__m128i a, __m128i b, __m128i* out) XASM_LINK("gfmul"); +#if defined(USE_INTEL_SPEEDUP) + #define HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX2 +#endif /* USE_INTEL_SPEEDUP */ +static const __m128i MOD2_128 = { 0x1, 0xc200000000000000UL }; + +static __m128i gfmul_sw(__m128i a, __m128i b) +{ + __m128i r, t1, t2, t3, t4, t5, t6, t7; +#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL + /* 128 x 128 Carryless Multiply */ + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); + + /* shift left 1 bit - bits reversed */ + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + /* Reduction */ + t2 = _mm_clmulepi64_si128(t1, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t1, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + r = _mm_xor_si128(t4, t3); +#else + t2 = _mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + t5 = _mm_slli_epi32(t1, 31); + t6 = _mm_slli_epi32(t1, 30); + t7 = _mm_slli_epi32(t1, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + t1 = _mm_xor_si128(t1, t5); + t7 = _mm_srli_epi32(t1, 1); + t3 = _mm_srli_epi32(t1, 2); + t2 = _mm_srli_epi32(t1, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, t1); + r = _mm_xor_si128(t4, t7); +#endif + + return r; +} + +static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1) +{ + __m128i t1, t2, t3, t4; + + /* 128 x 128 Carryless Multiply */ +#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); +#else + t2 = 
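/* WOLFSSL_AES_GCM_SLOW_CLMUL branch: the same 128 x 128 carry-less multiply,
 * but with three PCLMULQDQ instructions (a Karatsuba-style split of the
 * operands) instead of four, presumably for CPUs where PCLMULQDQ is costly.
 * Either way the unreduced 256-bit product is accumulated into *r0/*r1 so the
 * modular reduction can be deferred to ghash_red(). */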
_mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); +#endif + *r0 = _mm_xor_si128(t1, *r0); + *r1 = _mm_xor_si128(t4, *r1); +} + +static __m128i gfmul_shl1(__m128i a) +{ + __m128i t1 = a, t2; + t2 = _mm_srli_epi64(t1, 63); + t1 = _mm_slli_epi64(t1, 1); + t2 = _mm_slli_si128(t2, 8); + t1 = _mm_or_si128(t1, t2); + /* if (a[1] >> 63) t1 = _mm_xor_si128(t1, MOD2_128); */ + a = _mm_shuffle_epi32(a, 0xff); + a = _mm_srai_epi32(a, 31); + a = _mm_and_si128(a, MOD2_128); + t1 = _mm_xor_si128(t1, a); + return t1; +} + +static __m128i ghash_red(__m128i r0, __m128i r1) +{ + __m128i t2, t3; +#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL + t2 = _mm_clmulepi64_si128(r0, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(r0, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + return _mm_xor_si128(r1, t3); +#else + __m128i t5, t6, t7; + + t5 = _mm_slli_epi32(r0, 31); + t6 = _mm_slli_epi32(r0, 30); + t7 = _mm_slli_epi32(r0, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + r0 = _mm_xor_si128(r0, t5); + t7 = _mm_srli_epi32(r0, 1); + t3 = _mm_srli_epi32(r0, 2); + t2 = _mm_srli_epi32(r0, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, r0); + return _mm_xor_si128(r1, t7); +#endif +} + +static __m128i gfmul_shifted(__m128i a, __m128i b) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a, b, &t0, &t1); + return ghash_red(t0, t1); +} + +#ifndef AES_GCM_AESNI_NO_UNROLL +static __m128i gfmul8(__m128i a1, __m128i a2, __m128i a3, __m128i a4, + __m128i a5, __m128i a6, __m128i a7, __m128i a8, + __m128i b1, __m128i b2, __m128i b3, __m128i b4, + __m128i b5, __m128i b6, __m128i b7, __m128i b8) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a1, b8, &t0, &t1); + gfmul_only(a2, b7, &t0, &t1); + gfmul_only(a3, b6, &t0, &t1); + gfmul_only(a4, b5, &t0, &t1); + gfmul_only(a5, b4, &t0, &t1); + gfmul_only(a6, b3, &t0, &t1); + gfmul_only(a7, b2, &t0, &t1); + gfmul_only(a8, b1, &t0, &t1); + return ghash_red(t0, t1); +} +#endif /* See Intel® Carry-Less Multiplication Instruction * and its Usage for Computing the GCM Mode White Paper @@ -3549,317 +3732,2100 @@ void gfmul(__m128i a, __m128i b, __m128i* out) XASM_LINK("gfmul"); /* Figure 9. 
AES-GCM – Encrypt With Single Block Ghash at a Time */ -static void AES_GCM_encrypt(const unsigned char *in, - unsigned char *out, +static const __m128i ONE = { 0x0, 0x1 }; +#ifndef AES_GCM_AESNI_NO_UNROLL +static const __m128i TWO = { 0x0, 0x2 }; +static const __m128i THREE = { 0x0, 0x3 }; +static const __m128i FOUR = { 0x0, 0x4 }; +static const __m128i FIVE = { 0x0, 0x5 }; +static const __m128i SIX = { 0x0, 0x6 }; +static const __m128i SEVEN = { 0x0, 0x7 }; +static const __m128i EIGHT = { 0x0, 0x8 }; +#endif +static const __m128i BSWAP_EPI64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; +static const __m128i BSWAP_MASK = { 0x08090a0b0c0d0e0f, 0x0001020304050607 }; + +static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, - unsigned char *tag, - int nbytes, int abytes, int ibytes, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, const unsigned char* key, int nr) { int i, j ,k; - __m128i tmp1, tmp2, tmp3, tmp4; + __m128i ctr1; __m128i H, Y, T; - __m128i *KEY = (__m128i*)key; - __m128i ctr1, ctr2, ctr3, ctr4; - __m128i last_block = _mm_setzero_si128(); - __m128i ONE = _mm_set_epi32(0, 1, 0, 0); - __m128i FOUR = _mm_set_epi32(0, 4, 0, 0); - __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); - __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); + __m128i tmp1, tmp2; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i XV; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; +#endif - if(ibytes == 96/8) { + if (ibytes == 12) { Y = _mm_setzero_si128(); - for(j=0; j < ibytes%16; j++) + for (j=0; j < 12; j++) ((unsigned char*)&Y)[j] = ivec[j]; Y = _mm_insert_epi32(Y, 0x1000000, 3); /* (Compute E[ZERO, KS] and E[Y0, KS] together */ tmp1 = _mm_xor_si128(X, KEY[0]); tmp2 = _mm_xor_si128(Y, KEY[0]); - for(j=1; j < nr-1; j+=2) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); - H = _mm_aesenclast_si128(tmp1, KEY[nr]); - T = _mm_aesenclast_si128(tmp2, 
KEY[nr]); + H = _mm_aesenclast_si128(tmp1, lastKey); + T = _mm_aesenclast_si128(tmp2, lastKey); H = _mm_shuffle_epi8(H, BSWAP_MASK); } else { + if (ibytes % 16) { + i = ibytes / 16; + for (j=0; j < (int)(ibytes%16); j++) + ((unsigned char*)&last_block)[j] = ivec[i*16+j]; + } tmp1 = _mm_xor_si128(X, KEY[0]); - for(j=1; j 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); H = _mm_shuffle_epi8(H, BSWAP_MASK); Y = _mm_setzero_si128(); - for(i=0; i < ibytes/16; i++) { + for (i=0; i < (int)(ibytes/16); i++) { tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); + Y = gfmul_sw(Y, H); } - if(ibytes%16) { - for(j=0; j < ibytes%16; j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; + if (ibytes % 16) { tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); + Y = gfmul_sw(Y, H); } tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); tmp1 = _mm_insert_epi64(tmp1, 0, 1); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); + Y = gfmul_sw(Y, H); Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ tmp1 = _mm_xor_si128(Y, KEY[0]); - for(j=1; j < nr; j++) - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - T = _mm_aesenclast_si128(tmp1, KEY[nr]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + T = _mm_aesenclast_si128(tmp1, lastKey); } - for(i=0; i= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); - for(i=0; i < nbytes/16/4; i++){ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); - tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); - tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, FOUR); - ctr2 = _mm_add_epi32(ctr2, FOUR); - ctr3 = _mm_add_epi32(ctr3, FOUR); - ctr4 = _mm_add_epi32(ctr4, FOUR); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); tmp1 =_mm_xor_si128(tmp1, KEY[0]); tmp2 =_mm_xor_si128(tmp2, KEY[0]); tmp3 
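/* Unrolled path: with at least eight 16-byte blocks of input, the powers
 * H^1..H^8 are precomputed into HT[] and eight counter blocks are encrypted
 * per pass. This first batch is encrypt-only; its GHASH contribution is
 * picked up from the ciphertext at the start of the next pass. */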
=_mm_xor_si128(tmp3, KEY[0]); tmp4 =_mm_xor_si128(tmp4, KEY[0]); - for(j=1; j < nr-1; j+=2){ - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + lastKey = KEY[10]; + if 
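/* Rounds 1-9 above are common to all key sizes; nr is 10/12/14 for
 * AES-128/192/256, so the extra rounds are handled by the nested nr checks
 * below and lastKey always ends up holding the final round key. */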
(nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]); - tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]); - tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]); - tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]); - tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0])); - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1])); - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2])); - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3])); - _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1); - _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2); - _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3); - _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4); + tmp1 =_mm_aesenclast_si128(tmp1, lastKey); + tmp2 =_mm_aesenclast_si128(tmp2, lastKey); + tmp3 =_mm_aesenclast_si128(tmp3, lastKey); + tmp4 =_mm_aesenclast_si128(tmp4, lastKey); + tmp5 =_mm_aesenclast_si128(tmp5, lastKey); + tmp6 =_mm_aesenclast_si128(tmp6, lastKey); + tmp7 =_mm_aesenclast_si128(tmp7, lastKey); + tmp8 =_mm_aesenclast_si128(tmp8, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[0])); + tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[1])); + tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[2])); + tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[3])); + tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[4])); + tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[5])); + tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[6])); + tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[7])); + _mm_storeu_si128(&((__m128i*)out)[0], tmp1); + _mm_storeu_si128(&((__m128i*)out)[1], tmp2); + _mm_storeu_si128(&((__m128i*)out)[2], tmp3); + _mm_storeu_si128(&((__m128i*)out)[3], tmp4); + _mm_storeu_si128(&((__m128i*)out)[4], tmp5); + _mm_storeu_si128(&((__m128i*)out)[5], tmp6); + _mm_storeu_si128(&((__m128i*)out)[6], tmp7); + _mm_storeu_si128(&((__m128i*)out)[7], tmp8); + + for (i=1; i < 
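/* Main loop: GHASH of the previous batch's eight ciphertext blocks
 * (out[(i-1)*8 ..]) is interleaved with the AES rounds of the next eight
 * counter blocks; the running digest X is folded into the first block via XV
 * and the deferred reduction is done once per batch in ghash_red(). */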
(int)(nbytes/16/8); i++) { + r0 = _mm_setzero_si128(); + r1 = _mm_setzero_si128(); + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+0]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + gfmul_only(XV, HT[7], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+1]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[6], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+2]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[5], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+3]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[4], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+4]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[3], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); + /* 128 x 128 
Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+5]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[2], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+6]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[1], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+7]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[0], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); + /* Reduction */ + X = ghash_red(r0, r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 =_mm_aesenclast_si128(tmp1, lastKey); + tmp2 =_mm_aesenclast_si128(tmp2, lastKey); + tmp3 =_mm_aesenclast_si128(tmp3, lastKey); + tmp4 =_mm_aesenclast_si128(tmp4, lastKey); + tmp5 =_mm_aesenclast_si128(tmp5, lastKey); + tmp6 =_mm_aesenclast_si128(tmp6, 
lastKey); + tmp7 =_mm_aesenclast_si128(tmp7, lastKey); + tmp8 =_mm_aesenclast_si128(tmp8, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); + tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); + tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); + tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); + tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); + tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); + tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); + tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); + _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); + _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); + _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); + _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); + _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); + _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); + _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); + _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); + } + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); - X = _mm_xor_si128(X, tmp1); - gfmul(X, H, &X); - X = _mm_xor_si128(X, tmp2); - gfmul(X, H, &X); - X = _mm_xor_si128(X, tmp3); - gfmul(X, H, &X); - X = _mm_xor_si128(X, tmp4); - gfmul(X, H, &X); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); } - for(k = i*4; k < nbytes/16; k++){ + for (k = i*8; k < (int)(nbytes/16); k++) { tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); ctr1 = _mm_add_epi32(ctr1, ONE); tmp1 = _mm_xor_si128(tmp1, KEY[0]); - for(j=1; j 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); _mm_storeu_si128(&((__m128i*)out)[k], tmp1); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X =_mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = gfmul_shifted(X, H); } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + 
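/* AES_GCM_AESNI_NO_UNROLL path (see the configure.ac change above, which
 * defines it for the "small" AES-NI setting): one block per iteration. The
 * GHASH multiply for each block is issued during the AES rounds of the
 * following block, with a final gfmul_shifted() after the loop to close out
 * the last block. */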
_mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + for (; k < (int)(nbytes/16); k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + X = gfmul_shifted(X, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + if (k > 0) { + X = gfmul_shifted(X, H); + } +#endif /* If one partial block remains */ - if(nbytes%16){ + if (nbytes % 16) { tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); tmp1 = _mm_xor_si128(tmp1, KEY[0]); - for(j=1; j 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); - for(j=0; j < nbytes%16; j++) - ((unsigned char*)&last_block)[j]= in[k*16+j]; + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; tmp1 = _mm_xor_si128(tmp1, last_block); last_block = tmp1; - for(j=0; j < nbytes%16; j++) - out[k*16+j]=((unsigned char*)&last_block)[j]; - for(; j<16; j++) - ((unsigned char*)&last_block)[j]=0; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X =_mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = gfmul_shifted(X, H); } tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); X = _mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = gfmul_shifted(X, H); X = _mm_shuffle_epi8(X, BSWAP_MASK); T = _mm_xor_si128(X, T); _mm_storeu_si128((__m128i*)tag, T); } - -#ifdef HAVE_AES_DECRYPT -/* Figure 10. 
AES-GCM – Decrypt With Single Block Ghash at a Time */ - -static int AES_GCM_decrypt(const unsigned char *in, - unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, - const unsigned char *tag, int nbytes, int abytes, - int ibytes, const unsigned char* key, int nr) +#ifdef HAVE_INTEL_AVX2 +static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr) { int i, j ,k; - __m128i tmp1, tmp2, tmp3, tmp4; + __m128i ctr1; __m128i H, Y, T; - __m128i *KEY = (__m128i*)key; - __m128i ctr1, ctr2, ctr3, ctr4; - __m128i last_block = _mm_setzero_si128(); - __m128i ONE = _mm_set_epi32(0, 1, 0, 0); - __m128i FOUR = _mm_set_epi32(0, 4, 0, 0); - __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); - __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); +#else + __m128i tmp1, tmp2; +#endif - if (ibytes == 96/8) { + if (ibytes == 12) { Y = _mm_setzero_si128(); - for(j=0; j < ibytes%16; j++) + for (j=0; j < 12; j++) ((unsigned char*)&Y)[j] = ivec[j]; Y = _mm_insert_epi32(Y, 0x1000000, 3); /* (Compute E[ZERO, KS] and E[Y0, KS] together */ tmp1 = _mm_xor_si128(X, KEY[0]); tmp2 = _mm_xor_si128(Y, KEY[0]); - for (j = 1; j < nr - 1; j += 2) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); - H = _mm_aesenclast_si128(tmp1, KEY[nr]); - T = _mm_aesenclast_si128(tmp2, KEY[nr]); + H = _mm_aesenclast_si128(tmp1, lastKey); + T = _mm_aesenclast_si128(tmp2, lastKey); H = _mm_shuffle_epi8(H, 
BSWAP_MASK); } else { + if (ibytes % 16) { + i = ibytes / 16; + for (j=0; j < (int)(ibytes%16); j++) + ((unsigned char*)&last_block)[j] = ivec[i*16+j]; + } tmp1 = _mm_xor_si128(X, KEY[0]); - for (j = 1; j < nr; j++) - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - H = _mm_aesenclast_si128(tmp1, KEY[nr]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); H = _mm_shuffle_epi8(H, BSWAP_MASK); Y = _mm_setzero_si128(); - - for (i = 0; i < ibytes / 16; i++) { + for (i=0; i < (int)(ibytes/16); i++) { tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); + Y = gfmul_sw(Y, H); } - if (ibytes % 16) { - for(j = 0; j < ibytes % 16; j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); + Y = gfmul_sw(Y, H); } - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); tmp1 = _mm_insert_epi64(tmp1, 0, 1); Y = _mm_xor_si128(Y, tmp1); - gfmul(Y, H, &Y); - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); - /* Compute E(K, Y0) */ + Y = gfmul_sw(Y, H); + Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ tmp1 = _mm_xor_si128(Y, KEY[0]); - for(j=1; j < nr; j++) - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - T = _mm_aesenclast_si128(tmp1, KEY[nr]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + T = _mm_aesenclast_si128(tmp1, lastKey); } - for (i = 0; i < abytes / 16; i++) { + for (i=0; i < (int)(abytes/16); i++) { tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X = _mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = gfmul_sw(X, H); } - - if (abytes % 16) { + if (abytes%16) { last_block = _mm_setzero_si128(); - for (j = 0;j < abytes % 16; j++) + for (j=0; j < (int)(abytes%16); j++) ((unsigned char*)&last_block)[j] = addt[i*16+j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X =_mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); } - for (i = 0; i < nbytes / 16; i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]); + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); + +#ifndef AES_GCM_AESNI_NO_UNROLL + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = 
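/* AVX2 variant: the same H^1..H^8 table is built, but the eight-block batches
 * are processed with hand-written inline assembly (VAESENC/VPCLMULQDQ),
 * keeping the counter in xmm0 (spilled to pctr1[] between batches) and the
 * eight state blocks pinned to xmm4-xmm11 via the register asm() declarations
 * above. */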
gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); + + pctr1[0] = ctr1; + __asm__ __volatile__ ( + "vmovaps (%[pctr1]), %%xmm0\n\t" + "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" + "vpshufb %%xmm1, %%xmm0, %0\n\t" + "vpaddd %[ONE], %%xmm0, %1\n\t" + "vpshufb %%xmm1, %1, %1\n\t" + "vpaddd %[TWO], %%xmm0, %2\n\t" + "vpshufb %%xmm1, %2, %2\n\t" + "vpaddd %[THREE], %%xmm0, %3\n\t" + "vpshufb %%xmm1, %3, %3\n\t" + "vpaddd %[FOUR], %%xmm0, %4\n\t" + "vpshufb %%xmm1, %4, %4\n\t" + "vpaddd %[FIVE], %%xmm0, %5\n\t" + "vpshufb %%xmm1, %5, %5\n\t" + "vpaddd %[SIX], %%xmm0, %6\n\t" + "vpshufb %%xmm1, %6, %6\n\t" + "vpaddd %[SEVEN], %%xmm0, %7\n\t" + "vpshufb %%xmm1, %7, %7\n\t" + "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" + + "vmovaps (%[KEY]), %%xmm1\n\t" + "vmovaps %%xmm0, (%[pctr1])\n\t" + "vpxor %%xmm1, %0, %0\n\t" + "vpxor %%xmm1, %1, %1\n\t" + "vpxor %%xmm1, %2, %2\n\t" + "vpxor %%xmm1, %3, %3\n\t" + "vpxor %%xmm1, %4, %4\n\t" + "vpxor %%xmm1, %5, %5\n\t" + "vpxor %%xmm1, %6, %6\n\t" + "vpxor %%xmm1, %7, %7\n\t" + + "vmovaps 16(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 32(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 48(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 64(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 80(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 96(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 112(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 128(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 
144(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl L_enc128_enclast\n\t" + + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 176(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl L_enc128_enclast\n\t" + + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 208(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 224(%[KEY]), %%xmm12\n\t" + "\n" + "L_enc128_enclast:\n\t" + "vaesenclast %%xmm12, %0, %0\n\t" + "vaesenclast %%xmm12, %1, %1\n\t" + "vpxor (%[in]), %0, %0\n\t" + "vpxor 16(%[in]), %1, %1\n\t" + "vmovdqu %0, (%[out])\n\t" + "vmovdqu %1, 16(%[out])\n\t" + "vaesenclast %%xmm12, %2, %2\n\t" + "vaesenclast %%xmm12, %3, %3\n\t" + "vpxor 32(%[in]), %2, %2\n\t" + "vpxor 48(%[in]), %3, %3\n\t" + "vmovdqu %2, 32(%[out])\n\t" + "vmovdqu %3, 48(%[out])\n\t" + "vaesenclast %%xmm12, %4, %4\n\t" + "vaesenclast %%xmm12, %5, %5\n\t" + "vpxor 64(%[in]), %4, %4\n\t" + "vpxor 80(%[in]), %5, %5\n\t" + "vmovdqu %4, 64(%[out])\n\t" + "vmovdqu %5, 80(%[out])\n\t" + "vaesenclast %%xmm12, %6, %6\n\t" + "vaesenclast %%xmm12, %7, %7\n\t" + "vpxor 96(%[in]), %6, %6\n\t" + "vpxor 112(%[in]), %7, %7\n\t" + "vmovdqu %6, 96(%[out])\n\t" + "vmovdqu %7, 112(%[out])\n\t" + + : "=xr" (tmp1), "=xr" (tmp2), "=xr" (tmp3), "=xr" (tmp4), + "=xr" (tmp5), "=xr" (tmp6), "=xr" (tmp7), "=xr" (tmp8) + : [KEY] "r" (KEY), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_EPI64] "xrm" (BSWAP_EPI64), + [ONE] "xrm" (ONE), [TWO] "xrm" (TWO), + [THREE] "xrm" (THREE), [FOUR] "xrm" (FOUR), + [FIVE] "xrm" (FIVE), [SIX] "xrm" (SIX), + [SEVEN] "xrm" (SEVEN), [EIGHT] "xrm" (EIGHT) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + + XV = X; + for (i=1; i < (int)(nbytes/16/8); i++) { + __asm__ __volatile__ ( + "vmovaps (%[pctr1]), %%xmm0\n\t" + "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" + "vpshufb %%xmm1, %%xmm0, %0\n\t" + "vpaddd %[ONE], %%xmm0, %1\n\t" + "vpshufb %%xmm1, %1, %1\n\t" + "vpaddd %[TWO], %%xmm0, %2\n\t" + "vpshufb %%xmm1, %2, %2\n\t" + "vpaddd %[THREE], %%xmm0, %3\n\t" + "vpshufb %%xmm1, %3, %3\n\t" + "vpaddd %[FOUR], %%xmm0, %4\n\t" + "vpshufb %%xmm1, %4, %4\n\t" + "vpaddd %[FIVE], %%xmm0, %5\n\t" + "vpshufb %%xmm1, %5, %5\n\t" + "vpaddd %[SIX], %%xmm0, %6\n\t" + "vpshufb %%xmm1, %6, %6\n\t" + "vpaddd %[SEVEN], %%xmm0, 
%7\n\t" + "vpshufb %%xmm1, %7, %7\n\t" + "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" + + "vmovaps (%[KEY]), %%xmm1\n\t" + "vmovaps %%xmm0, (%[pctr1])\n\t" + "vpxor %%xmm1, %0, %0\n\t" + "vpxor %%xmm1, %1, %1\n\t" + "vpxor %%xmm1, %2, %2\n\t" + "vpxor %%xmm1, %3, %3\n\t" + "vpxor %%xmm1, %4, %4\n\t" + "vpxor %%xmm1, %5, %5\n\t" + "vpxor %%xmm1, %6, %6\n\t" + "vpxor %%xmm1, %7, %7\n\t" + + "vmovaps 16(%[KEY]), %%xmm12\n\t" + "vmovdqu -128(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 112(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vpxor %[XV], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 32(%[KEY]), %%xmm12\n\t" + "vmovdqu -112(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 96(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 48(%[KEY]), %%xmm12\n\t" + "vmovdqu -96(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 80(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 64(%[KEY]), %%xmm12\n\t" + "vmovdqu -80(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 64(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + 
"vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 80(%[KEY]), %%xmm12\n\t" + "vmovdqu -64(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 48(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 96(%[KEY]), %%xmm12\n\t" + "vmovdqu -48(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 32(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 112(%[KEY]), %%xmm12\n\t" + "vmovdqu -32(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps 16(%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 128(%[KEY]), %%xmm12\n\t" + "vmovdqu -16(%[out]), %%xmm1\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovaps (%[HT]), %%xmm0\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm14\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vpxor %%xmm15, %%xmm2, 
%%xmm2\n\t" + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + + "vmovaps 144(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vmovdqa %%xmm13, %%xmm2\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 176(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 208(%[KEY]), %%xmm12\n\t" + "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %7, %7\n\t" + "vmovaps 224(%[KEY]), %%xmm12\n\t" + + "%=:\n\t" + "vaesenclast %%xmm12, %0, %0\n\t" + "vaesenclast %%xmm12, %1, %1\n\t" + "vpxor (%[in]), %0, %0\n\t" + "vpxor 16(%[in]), %1, %1\n\t" + "vmovdqu %0, (%[out])\n\t" + "vmovdqu %1, 16(%[out])\n\t" + "vaesenclast %%xmm12, %2, %2\n\t" + "vaesenclast %%xmm12, %3, %3\n\t" + "vpxor 32(%[in]), %2, %2\n\t" + "vpxor 48(%[in]), %3, %3\n\t" + "vmovdqu %2, 32(%[out])\n\t" + "vmovdqu %3, 48(%[out])\n\t" + "vaesenclast %%xmm12, %4, %4\n\t" + "vaesenclast %%xmm12, %5, %5\n\t" + "vpxor 64(%[in]), %4, %4\n\t" + "vpxor 80(%[in]), %5, %5\n\t" + "vmovdqu %4, 64(%[out])\n\t" + "vmovdqu %5, 80(%[out])\n\t" + "vaesenclast %%xmm12, %6, %6\n\t" + "vaesenclast %%xmm12, %7, %7\n\t" + "vpxor 96(%[in]), %6, %6\n\t" + "vpxor 112(%[in]), %7, %7\n\t" + "vmovdqu %6, 96(%[out])\n\t" + "vmovdqu %7, 112(%[out])\n\t" + + : "=xr" (tmp1), "=xr" (tmp2), "=xr" (tmp3), "=xr" (tmp4), + "=xr" (tmp5), "=xr" (tmp6), "=xr" (tmp7), "=xr" (tmp8), + [XV] "+xr" (XV) + : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_MASK] "xrm" (BSWAP_MASK), + [BSWAP_EPI64] "xrm" (BSWAP_EPI64), + [ONE] "xrm" (ONE), [TWO] "xrm" (TWO), + [THREE] "xrm" (THREE), [FOUR] "xrm" (FOUR), + [FIVE] "xrm" (FIVE), [SIX] "xrm" (SIX), + [SEVEN] "xrm" (SEVEN), [EIGHT] "xrm" (EIGHT), + [MOD2_128] "xrm" (MOD2_128) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + } + X = XV; + ctr1 = 
pctr1[0]; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %0, %0\n\t" + "vaesenc 16(%[KEY]), %0, %0\n\t" + "vaesenc 32(%[KEY]), %0, %0\n\t" + "vaesenc 48(%[KEY]), %0, %0\n\t" + "vaesenc 64(%[KEY]), %0, %0\n\t" + "vaesenc 80(%[KEY]), %0, %0\n\t" + "vaesenc 96(%[KEY]), %0, %0\n\t" + "vaesenc 112(%[KEY]), %0, %0\n\t" + "vaesenc 128(%[KEY]), %0, %0\n\t" + "vaesenc 144(%[KEY]), %0, %0\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 176(%[KEY]), %0, %0\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 208(%[KEY]), %0, %0\n\t" + "vmovaps 224(%[KEY]), %1\n\t" + "%=:\n\t" + "vaesenclast %1, %0, %0\n\t" + "vpxor (%[in]), %0, %0\n\t" + "vmovdqu %0, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %0, %0\n\t" + + "vpxor %0, %[X], %[X]\n\t" + "# Carryless Multiply X by H (128 x 128)\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "# Reduce\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "# End Reduce\n\t" + + : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "xrm" (BSWAP_MASK), + [BSWAP_EPI64] "xrm" (BSWAP_EPI64), + [ONE] "xrm" (ONE), + [MOD2_128] "xrm" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %0, %0\n\t" + "vaesenc 16(%[KEY]), %0, %0\n\t" + "vaesenc 32(%[KEY]), %0, %0\n\t" + "vaesenc 48(%[KEY]), %0, %0\n\t" + "vaesenc 64(%[KEY]), %0, %0\n\t" + "vaesenc 80(%[KEY]), %0, %0\n\t" + "vaesenc 96(%[KEY]), %0, %0\n\t" + "vaesenc 112(%[KEY]), %0, %0\n\t" + "vaesenc 128(%[KEY]), %0, %0\n\t" + "vaesenc 144(%[KEY]), %0, %0\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 176(%[KEY]), %0, %0\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 208(%[KEY]), %0, %0\n\t" + "vmovaps 224(%[KEY]), %1\n\t" + "%=:\n\t" + "vaesenclast %1, %0, %0\n\t" + "vpxor 
(%[in]), %0, %0\n\t" + "vmovdqu %0, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %0, %0\n\t" + "vpxor %0, %[X], %[X]\n\t" + + : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "xrm" (BSWAP_MASK), + [BSWAP_EPI64] "xrm" (BSWAP_EPI64), + [ONE] "xrm" (ONE), + [MOD2_128] "xrm" (MOD2_128) + : "memory" + ); + } + for (; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %0, %0\n\t" + "vaesenc 16(%[KEY]), %0, %0\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vaesenc 32(%[KEY]), %0, %0\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vaesenc 48(%[KEY]), %0, %0\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vaesenc 64(%[KEY]), %0, %0\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vaesenc 80(%[KEY]), %0, %0\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc 96(%[KEY]), %0, %0\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vaesenc 112(%[KEY]), %0, %0\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vaesenc 128(%[KEY]), %0, %0\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vaesenc 144(%[KEY]), %0, %0\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 176(%[KEY]), %0, %0\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %1\n\t" + "jl %=f\n\t" + "vaesenc %1, %0, %0\n\t" + "vaesenc 208(%[KEY]), %0, %0\n\t" + "vmovaps 224(%[KEY]), %1\n\t" + "%=:\n\t" + "vaesenclast %1, %0, %0\n\t" + "vpxor (%[in]), %0, %0\n\t" + "vmovdqu %0, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %0, %0\n\t" + "vpxor %0, %[X], %[X]\n\t" + + : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "xrm" (BSWAP_MASK), + [BSWAP_EPI64] "xrm" (BSWAP_EPI64), + [ONE] "xrm" (ONE), + [MOD2_128] "xrm" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + if (k > 0) { + X = gfmul_shifted(X, H); + } +#endif + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for 
(j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); +} +#endif /* HAVE_INTEL_AVX2 */ + + +#ifdef HAVE_AES_DECRYPT +/* Figure 10. AES-GCM – Decrypt With Single Block Ghash at a Time */ + +static int AES_GCM_decrypt(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, int nbytes, int abytes, + int ibytes, const unsigned char* key, int nr) +{ + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); + __m128i tmp1, tmp2, XV; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; +#endif + + if (ibytes == 12) { + Y = _mm_setzero_si128(); + for (j=0; j < 12; j++) + ((unsigned char*)&Y)[j] = ivec[j]; + Y = _mm_insert_epi32(Y, 0x1000000, 3); + /* (Compute E[ZERO, KS] and E[Y0, KS] together */ + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp2 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); + T = _mm_aesenclast_si128(tmp2, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); + } + else { + if (ibytes % 16) { + i = ibytes / 16; + for (j=0; j < ibytes%16; j++) + ((unsigned char*)&last_block)[j] = ivec[i*16+j]; + } + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + H = 
_mm_aesenclast_si128(tmp1, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); + + Y = _mm_setzero_si128(); + for (i=0; i < ibytes/16; i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + if (ibytes % 16) { + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, 0, 1); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ + tmp1 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + T = _mm_aesenclast_si128(tmp1, lastKey); + } + + for (i=0; i 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 =_mm_aesenclast_si128(tmp1, lastKey); + tmp2 =_mm_aesenclast_si128(tmp2, lastKey); + tmp3 =_mm_aesenclast_si128(tmp3, lastKey); + tmp4 =_mm_aesenclast_si128(tmp4, lastKey); + tmp5 =_mm_aesenclast_si128(tmp5, lastKey); + tmp6 =_mm_aesenclast_si128(tmp6, lastKey); + tmp7 =_mm_aesenclast_si128(tmp7, lastKey); + tmp8 =_mm_aesenclast_si128(tmp8, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); + tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); + tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); + tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); + tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); + tmp6 = _mm_xor_si128(tmp6, 
_mm_loadu_si128(&((__m128i*)in)[i*8+5])); + tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); + tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); + _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); + _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); + _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); + _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); + _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); + _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); + _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); + _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); + } +#endif + for (k = i*8; k < nbytes/16; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[k]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp2 = _mm_loadu_si128(&((__m128i*)in)[k]); + tmp1 = _mm_xor_si128(tmp1, tmp2); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + } + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ X = _mm_xor_si128(X, tmp1); - gfmul(X, H, &X); + X = gfmul_shifted(X, H); X = _mm_shuffle_epi8(X, BSWAP_MASK); T = _mm_xor_si128(X, T); @@ -3867,106 +5833,642 @@ static int AES_GCM_decrypt(const unsigned char *in, _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) return 0; /* in case the authentication failed */ - ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, ONE); - ctr2 = _mm_add_epi32(ctr1, ONE); 
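AES_GCM_decrypt above (and the AVX2 variant added below) only reports success when the computed tag T matches the caller-supplied tag byte-for-byte, otherwise it returns 0 before any plaintext is trusted. A minimal sketch of that comparison in isolation, assuming SSE2 intrinsics and a 16-byte tag; the helper name is illustrative and not part of the patch:

#include <emmintrin.h>   /* SSE2: _mm_loadu_si128, _mm_cmpeq_epi8, _mm_movemask_epi8 */

/* Sketch: GCM tag comparison as used in the decrypt paths.
 * _mm_cmpeq_epi8() sets each byte lane to 0xff on equality, so the byte
 * movemask is 0xffff only when all 16 bytes of the computed and received
 * tags agree. Returns 1 on match, 0 on authentication failure. */
static int gcm_tag_matches(__m128i T, const unsigned char* tag)
{
    __m128i expected = _mm_loadu_si128((const __m128i*)tag);
    __m128i eq       = _mm_cmpeq_epi8(T, expected);
    return _mm_movemask_epi8(eq) == 0xffff;
}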
- ctr3 = _mm_add_epi32(ctr2, ONE); - ctr4 = _mm_add_epi32(ctr3, ONE); + return 1; /* when successful returns 1 */ +} - for (i=0; i < nbytes/16/4; i++) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); - tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); - tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); +#ifdef HAVE_INTEL_AVX2 +static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + const unsigned char *tag, int nbytes, + int abytes, int ibytes, + const unsigned char* key, int nr) +{ + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); +#else + __m128i tmp1, tmp2, XV; +#endif - ctr1 = _mm_add_epi32(ctr1, FOUR); - ctr2 = _mm_add_epi32(ctr2, FOUR); - ctr3 = _mm_add_epi32(ctr3, FOUR); - ctr4 = _mm_add_epi32(ctr4, FOUR); - - tmp1 =_mm_xor_si128(tmp1, KEY[0]); - tmp2 =_mm_xor_si128(tmp2, KEY[0]); - tmp3 =_mm_xor_si128(tmp3, KEY[0]); - tmp4 =_mm_xor_si128(tmp4, KEY[0]); - - for (j = 1; j < nr - 1; j += 2) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); - - tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]); + if (ibytes == 12) { + Y = _mm_setzero_si128(); + for (j=0; j < 12; j++) + ((unsigned char*)&Y)[j] = ivec[j]; + Y = _mm_insert_epi32(Y, 0x1000000, 3); + /* (Compute E[ZERO, KS] and E[Y0, KS] together */ + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp2 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + lastKey = KEY[14]; + } } + H = _mm_aesenclast_si128(tmp1, lastKey); + T = _mm_aesenclast_si128(tmp2, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); + } + else { + if (ibytes % 16) { + i = ibytes / 16; + for (j=0; j < ibytes%16; j++) + 
((unsigned char*)&last_block)[j] = ivec[i*16+j]; + } + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]); - - tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]); - tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]); - tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]); - tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]); - - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0])); - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1])); - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2])); - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3])); - - _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1); - _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2); - _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3); - _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4); - - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); - tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); - tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + Y = _mm_setzero_si128(); + for (i=0; i < ibytes/16; i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + if (ibytes % 16) { + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, 0, 1); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ + tmp1 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + T = _mm_aesenclast_si128(tmp1, lastKey); } - /* Acknowledge the dead store and continue */ - (void) tmp1; - (void) tmp2; - (void) tmp3; - (void) tmp4; + for (i=0; i 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } } - tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); - tmp1 = 
_mm_aesenclast_si128(tmp1, KEY[nr]); - for(j=0; j < nbytes%16; j++) - ((unsigned char*)&last_block)[j]= in[k*16+j]; + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; tmp1 = _mm_xor_si128(tmp1, last_block); last_block = tmp1; - for (j = 0; j < nbytes % 16; j++) - out[k*16+j]=((unsigned char*)&last_block)[j]; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + return 0; /* in case the authentication failed */ + return 1; /* when successful returns 1 */ } +#endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AESNI */ @@ -4659,8 +7161,17 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESNI if (haveAESNI) { - AES_GCM_encrypt(in, out, authIn, iv, authTag, - sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + #ifdef HAVE_INTEL_AVX2 + word32 intel_flags = cpuid_get_flags(); + + if (IS_INTEL_AVX2(intel_flags)) { + AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + } + else + #endif + AES_GCM_encrypt(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); return 0; } #endif @@ -4913,8 +7424,18 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESNI if (haveAESNI) { - if (AES_GCM_decrypt(in, out, authIn, iv, authTag, - sz, authInSz, ivSz, (byte*)aes->key, aes->rounds) == 0) + #ifdef HAVE_INTEL_AVX2 + word32 intel_flags = cpuid_get_flags(); + + if (IS_INTEL_AVX2(intel_flags)) { + if (AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, + ivSz, (byte*)aes->key, aes->rounds) == 0) + return AES_GCM_AUTH_E; + } + else + #endif + if (AES_GCM_decrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + (byte*)aes->key, aes->rounds) == 0) return AES_GCM_AUTH_E; return 0; } diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 3c625bc11..aaac07a01 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -1502,100 +1502,4 @@ MAKE_RK256_b: pxor xmm3,xmm2 ret - -; See Intel® Carry-Less Multiplication Instruction -; and its Usage for Computing the GCM Mode White Paper -; by Shay Gueron, Intel Mobility Group, Israel Development Center; -; and Michael E. 
Kounavis, Intel Labs, Circuits and Systems Research - -; void gfmul(__m128i a, __m128i b, __m128i* out); - -; .globl gfmul -gfmul PROC - ; xmm0 holds operand a (128 bits) - ; xmm1 holds operand b (128 bits) - ; r8 holds the pointer to output (128 bits) - - ; convert to what we had for att&t convention - movdqa xmm0, [rcx] - movdqa xmm1, [rdx] - - ; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+4*16 ; 8 = align stack , 4 xmm6-9 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - - movdqa xmm3, xmm0 - pclmulqdq xmm3, xmm1, 0 ; xmm3 holds a0*b0 - movdqa xmm4, xmm0 - pclmulqdq xmm4, xmm1, 16 ; xmm4 holds a0*b1 - movdqa xmm5, xmm0 - pclmulqdq xmm5, xmm1, 1 ; xmm5 holds a1*b0 - movdqa xmm6, xmm0 - pclmulqdq xmm6, xmm1, 17 ; xmm6 holds a1*b1 - pxor xmm4, xmm5 ; xmm4 holds a0*b1 + a1*b0 - movdqa xmm5, xmm4 - psrldq xmm4, 8 - pslldq xmm5, 8 - pxor xmm3, xmm5 - pxor xmm6, xmm4 ; holds the result of - ; the carry-less multiplication of - ; xmm0 by xmm1 - -; shift the result by one bit position to the left cope for the fact -; that bits are reversed - movdqa xmm7, xmm3 - movdqa xmm8, xmm6 - pslld xmm3, 1 - pslld xmm6, 1 - psrld xmm7, 31 - psrld xmm8, 31 - movdqa xmm9, xmm7 - pslldq xmm8, 4 - pslldq xmm7, 4 - psrldq xmm9, 12 - por xmm3, xmm7 - por xmm6, xmm8 - por xmm6, xmm9 - -; first phase of the reduction - movdqa xmm7, xmm3 - movdqa xmm8, xmm3 - movdqa xmm9, xmm3 - pslld xmm7, 31 ; packed right shifting << 31 - pslld xmm8, 30 ; packed right shifting shift << 30 - pslld xmm9, 25 ; packed right shifting shift << 25 - pxor xmm7, xmm8 ; xor the shifted versions - pxor xmm7, xmm9 - - movdqa xmm8, xmm7 - pslldq xmm7, 12 - psrldq xmm8, 4 - pxor xmm3, xmm7 ; first phase of the reduction complete - movdqa xmm2, xmm3 ; second phase of the reduction - movdqa xmm4, xmm3 - movdqa xmm5, xmm3 - psrld xmm2, 1 ; packed left shifting >> 1 - psrld xmm4, 2 ; packed left shifting >> 2 - psrld xmm5, 7 ; packed left shifting >> 7 - - pxor xmm2, xmm4 ; xor the shifted versions - pxor xmm2, xmm5 - pxor xmm2, xmm8 - pxor xmm3, xmm2 - pxor xmm6, xmm3 ; the result is in xmm6 - movdqu [r8],xmm6 ; store the result - - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - add rsp,8+4*16 ; 8 = align stack , 4 xmm6-9 16 bytes each - - ret -gfmul ENDP - END diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index e47b3469e..0be550b2f 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -1288,91 +1288,6 @@ pxor %xmm4, %xmm3 pxor %xmm2, %xmm3 ret - -#ifdef HAVE_AESGCM - -/* See Intel® Carry-Less Multiplication Instruction - * and its Usage for Computing the GCM Mode White Paper - * by Shay Gueron, Intel Mobility Group, Israel Development Center; - * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research - * - * This is for use with the C code. - */ - -/* Figure 6. 
Code Sample - Performing Ghash Using Algorithms 1 and 5 */ - -/* - * void gfmul(__m128i a, __m128i b, __m128i* out); - */ -.globl gfmul -gfmul: - #xmm0 holds operand a (128 bits) - #xmm1 holds operand b (128 bits) - #rdi holds the pointer to output (128 bits) - movdqa %xmm0, %xmm3 - pclmulqdq $0, %xmm1, %xmm3 # xmm3 holds a0*b0 - movdqa %xmm0, %xmm4 - pclmulqdq $16, %xmm1, %xmm4 # xmm4 holds a0*b1 - movdqa %xmm0, %xmm5 - pclmulqdq $1, %xmm1, %xmm5 # xmm5 holds a1*b0 - movdqa %xmm0, %xmm6 - pclmulqdq $17, %xmm1, %xmm6 # xmm6 holds a1*b1 - pxor %xmm5, %xmm4 # xmm4 holds a0*b1 + a1*b0 - movdqa %xmm4, %xmm5 - psrldq $8, %xmm4 - pslldq $8, %xmm5 - pxor %xmm5, %xmm3 - pxor %xmm4, %xmm6 # holds the result of - # the carry-less multiplication of - # xmm0 by xmm1 - -# shift the result by one bit position to the left cope for the fact -# that bits are reversed - movdqa %xmm3, %xmm7 - movdqa %xmm6, %xmm8 - pslld $1, %xmm3 - pslld $1, %xmm6 - psrld $31, %xmm7 - psrld $31, %xmm8 - movdqa %xmm7, %xmm9 - pslldq $4, %xmm8 - pslldq $4, %xmm7 - psrldq $12, %xmm9 - por %xmm7, %xmm3 - por %xmm8, %xmm6 - por %xmm9, %xmm6 - -# first phase of the reduction - movdqa %xmm3, %xmm7 - movdqa %xmm3, %xmm8 - movdqa %xmm3, %xmm9 - pslld $31, %xmm7 # packed right shifting << 31 - pslld $30, %xmm8 # packed right shifting shift << 30 - pslld $25, %xmm9 # packed right shifting shift << 25 - pxor %xmm8, %xmm7 # xor the shifted versions - pxor %xmm9, %xmm7 - - movdqa %xmm7, %xmm8 - pslldq $12, %xmm7 - psrldq $4, %xmm8 - pxor %xmm7, %xmm3 # first phase of the reduction complete - movdqa %xmm3,%xmm2 # second phase of the reduction - movdqa %xmm3,%xmm4 - movdqa %xmm3,%xmm5 - psrld $1, %xmm2 # packed left shifting >> 1 - psrld $2, %xmm4 # packed left shifting >> 2 - psrld $7, %xmm5 # packed left shifting >> 7 - - pxor %xmm4, %xmm2 # xor the shifted versions - pxor %xmm5, %xmm2 - pxor %xmm8, %xmm2 - pxor %xmm2, %xmm3 - pxor %xmm3, %xmm6 # the result is in xmm6 - movdqu %xmm6, (%rdi) # store the result - ret - -#endif /* HAVE_AESGCM */ - #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c new file mode 100644 index 000000000..087217864 --- /dev/null +++ b/wolfcrypt/src/cpuid.c @@ -0,0 +1,99 @@ +/* cpuid.c + * + * Copyright (C) 2006-2016 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#include + +#ifdef WOLFSSL_X86_64_BUILD + /* Each platform needs to query info type 1 from cpuid to see if aesni is + * supported. 
Also, let's setup a macro for proper linkage w/o ABI conflicts + */ + + #ifndef _MSC_VER + #define cpuid(reg, leaf, sub)\ + __asm__ __volatile__ ("cpuid":\ + "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ + "a" (leaf), "c"(sub)); + + #define XASM_LINK(f) asm(f) + #else + + #include + #define cpuid(a,b) __cpuid((int*)a,b) + + #define XASM_LINK(f) + #endif /* _MSC_VER */ + + #define EAX 0 + #define EBX 1 + #define ECX 2 + #define EDX 3 + + static word32 cpuid_check = 0; + static word32 cpuid_flags = 0; + + static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) + { + static int got_intel_cpu = 0; + static unsigned int reg[5]; + + reg[4] = '\0'; + cpuid(reg, 0, 0); + if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 && + XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 && + XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) { + got_intel_cpu = 1; + } + if (got_intel_cpu) { + cpuid(reg, leaf, sub); + return ((reg[num] >> bit) & 0x1); + } + return 0; + } + + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + cpuid_check = 1; + if (cpuid_flag(1, 0, ECX, 28)) { cpuid_flags |= CPUID_AVX1 ; } + if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2 ; } + if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; } + if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND; } + if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; } + if (cpuid_flag(1, 0, ECX, 26)) { cpuid_flags |= CPUID_AESNI ; } + } + } + + word32 cpuid_get_flags(void) + { + if (!cpuid_check) + cpuid_set_flags(); + return cpuid_flags; + } +#endif + diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c index 2459f3de3..705578f1e 100644 --- a/wolfcrypt/src/random.c +++ b/wolfcrypt/src/random.c @@ -32,6 +32,7 @@ */ #include +#include #ifdef HAVE_FIPS @@ -141,12 +142,6 @@ int wc_RNG_GenerateByte(WC_RNG* rng, byte* b) #ifdef HAVE_INTEL_RDRAND static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz); #endif - static word32 cpuid_check = 0; - static word32 cpuid_flags = 0; - #define CPUID_RDRAND 0x4 - #define CPUID_RDSEED 0x8 - #define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND) - #define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED) #endif /* Start NIST DRBG code */ @@ -540,7 +535,7 @@ int wc_InitRng_ex(WC_RNG* rng, void* heap, int devId) #ifdef HAVE_INTEL_RDRAND /* if CPU supports RDRAND, use it directly and by-pass DRBG init */ - if (IS_INTEL_RDRAND) + if (IS_INTEL_RDRAND(cpuid_get_flags())) return 0; #endif @@ -610,7 +605,7 @@ int wc_RNG_GenerateBlock(WC_RNG* rng, byte* output, word32 sz) return BAD_FUNC_ARG; #ifdef HAVE_INTEL_RDRAND - if (IS_INTEL_RDRAND) + if (IS_INTEL_RDRAND(cpuid_get_flags())) return wc_GenerateRand_IntelRD(NULL, output, sz); #endif @@ -982,52 +977,8 @@ int wc_FreeNetRandom(void) #if defined(HAVE_INTEL_RDRAND) || defined(HAVE_INTEL_RDSEED) -#ifndef _MSC_VER - #define cpuid(reg, leaf, sub)\ - __asm__ __volatile__ ("cpuid":\ - "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ - "a" (leaf), "c"(sub)); - - #define XASM_LINK(f) asm(f) -#else - - #include - #define cpuid(a,b) __cpuid((int*)a,b) - - #define XASM_LINK(f) - -#endif /* _MSC_VER */ - -#define EAX 0 -#define EBX 1 -#define ECX 2 -#define EDX 3 - -static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { - int got_intel_cpu = 0; - unsigned int reg[5]; - - reg[4] = '\0'; - cpuid(reg, 0, 0); - if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 && - XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 && - XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) - { - got_intel_cpu = 1; 
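The new cpuid.c above caches the CPUID feature bits once and exposes them through cpuid_get_flags(); the random.c, sha256.c and sha512.c hunks that follow drop their private copies of this detection and call into the shared module instead. A minimal sketch of the intended caller pattern, with an illustrative helper name that is not part of the patch:

#include <wolfssl/wolfcrypt/cpuid.h>

/* Sketch: one-time feature query. cpuid_get_flags() calls cpuid_set_flags()
 * lazily on first use, so callers need no explicit initialization order. */
static void select_accel(void)            /* illustrative helper name */
{
    word32 intel_flags = cpuid_get_flags();

    if (IS_INTEL_AESNI(intel_flags)) {
        /* AES-NI / PCLMULQDQ code paths may be used */
    }
    if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
        /* AVX2 / RORX transforms may be selected */
    }
}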
- } - if (got_intel_cpu) { - cpuid(reg, leaf, sub); - return ((reg[num] >> bit) & 0x1); - } - return 0; -} - static void wc_InitRng_IntelRD(void) { - if (cpuid_check==0) { - if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND; } - if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; } - cpuid_check = 1; - } + cpuid_set_flags(); } #ifdef WOLFSSL_ASYNC_CRYPT @@ -1067,7 +1018,7 @@ static int wc_GenerateSeed_IntelRD(OS_Seed* os, byte* output, word32 sz) (void)os; - if (!IS_INTEL_RDSEED) + if (!IS_INTEL_RDSEED(cpuid_get_flags())) return -1; for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64), @@ -1122,7 +1073,7 @@ static int wc_GenerateRand_IntelRD(OS_Seed* os, byte* output, word32 sz) (void)os; - if (!IS_INTEL_RDRAND) + if (!IS_INTEL_RDRAND(cpuid_get_flags())) return -1; for (; (sz / sizeof(word64)) > 0; sz -= sizeof(word64), @@ -1702,7 +1653,7 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) int ret = 0; #ifdef HAVE_INTEL_RDSEED - if (IS_INTEL_RDSEED) { + if (IS_INTEL_RDSEED(cpuid_get_flags())) { ret = wc_GenerateSeed_IntelRD(NULL, output, sz); if (ret == 0) { /* success, we're done */ diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index 5ed2a51b1..3c137a6c4 100755 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -32,6 +32,7 @@ #include #include +#include /* fips wrapper calls, user can call direct */ #ifdef HAVE_FIPS @@ -177,77 +178,14 @@ static int InitSha256(Sha256* sha256) More granural Stitched Message Sched/Round } + #endif + */ /* Each platform needs to query info type 1 from cpuid to see if aesni is * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts */ - #ifndef _MSC_VER - #define cpuid(reg, leaf, sub)\ - __asm__ __volatile__ ("cpuid":\ - "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ - "a" (leaf), "c"(sub)); - - #define XASM_LINK(f) asm(f) - #else - #include - #define cpuid(a,b) __cpuid((int*)a,b) - - #define XASM_LINK(f) - #endif /* _MSC_VER */ - - #define EAX 0 - #define EBX 1 - #define ECX 2 - #define EDX 3 - - #define CPUID_AVX1 0x1 - #define CPUID_AVX2 0x2 - #define CPUID_RDRAND 0x4 - #define CPUID_RDSEED 0x8 - #define CPUID_BMI2 0x10 /* MULX, RORX */ - - #define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1) - #define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2) - #define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2) - #define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND) - #define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED) - - static word32 cpuid_check = 0; - static word32 cpuid_flags = 0; - - static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { - int got_intel_cpu=0; - unsigned int reg[5]; - - reg[4] = '\0'; - cpuid(reg, 0, 0); - if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 && - XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 && - XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) { - got_intel_cpu = 1; - } - if (got_intel_cpu) { - cpuid(reg, leaf, sub); - return ((reg[num] >> bit) & 0x1); - } - return 0; - } - - static int set_cpuid_flags(void) { - if (cpuid_check==0) { - if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; } - if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; } - if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; } - if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; } - if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; } - cpuid_check = 1; - return 0; - } - return 1; - } - /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */ static int Transform(Sha256* sha256); #if defined(HAVE_INTEL_AVX1) @@ -258,22 
+196,31 @@ static int InitSha256(Sha256* sha256) static int Transform_AVX1_RORX(Sha256 *sha256); #endif static int (*Transform_p)(Sha256* sha256) /* = _Transform */; + static int transform_check = 0; #define XTRANSFORM(sha256, B) (*Transform_p)(sha256) - static void set_Transform(void) { - if (set_cpuid_flags()) return; + static void set_Transform(void) + { + word32 intel_flags; + + cpuid_set_flags(); + if (transform_check) + return; + transform_check = 1; + intel_flags = cpuid_get_flags(); #if defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX2 && IS_INTEL_BMI2) { - Transform_p = Transform_AVX1_RORX; return; - Transform_p = Transform_AVX2; - /* for avoiding warning,"not used" */ - } + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { + Transform_p = Transform_AVX1_RORX; return; + Transform_p = Transform_AVX2; + /* for avoiding warning,"not used" */ + } #endif #if defined(HAVE_INTEL_AVX1) - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return; + Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 : + Transform); return; #endif - Transform_p = Transform; return; + Transform_p = Transform; return; } /* Dummy for saving MM_REGs on behalf of Transform */ @@ -519,6 +466,11 @@ static int InitSha256(Sha256* sha256) { int ret = 0; byte* local; +#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + word32 intel_flags = cpuid_get_flags(); +#endif +#endif if (sha256 == NULL || (data == NULL && len > 0)) { return BAD_FUNC_ARG; @@ -552,7 +504,7 @@ static int InitSha256(Sha256* sha256) if (sha256->buffLen == SHA256_BLOCK_SIZE) { #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) #endif { ByteReverseWords(sha256->buffer, sha256->buffer, @@ -582,6 +534,11 @@ static int InitSha256(Sha256* sha256) int ret; byte* local = (byte*)sha256->buffer; +#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + word32 intel_flags = cpuid_get_flags(); +#endif +#endif if (sha256 == NULL) { return BAD_FUNC_ARG; @@ -598,15 +555,15 @@ static int InitSha256(Sha256* sha256) SHA256_BLOCK_SIZE - sha256->buffLen); sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) - #endif { - ByteReverseWords(sha256->buffer, sha256->buffer, - SHA256_BLOCK_SIZE); + #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) + #endif + ByteReverseWords(sha256->buffer, sha256->buffer, + SHA256_BLOCK_SIZE); + #endif } - #endif ret = XTRANSFORM(sha256, local); if (ret != 0) @@ -624,7 +581,7 @@ static int InitSha256(Sha256* sha256) /* store lengths */ #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) #endif { ByteReverseWords(sha256->buffer, sha256->buffer, @@ -640,7 +597,7 @@ static int InitSha256(Sha256* sha256) defined(HAVE_INTEL_AVX2) /* Kinetis requires only these bytes reversed */ #if defined(HAVE_INTEL_AVX1) || 
defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX1 || IS_INTEL_AVX2) + if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags)) #endif { ByteReverseWords( diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index cb1fe3aed..0b19795cb 100755 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -27,10 +27,9 @@ #include #ifdef WOLFSSL_SHA512 -#include - #include #include +#include /* fips wrapper calls, user can call direct */ #ifdef HAVE_FIPS @@ -261,74 +260,6 @@ static int InitSha512(Sha512* sha512) * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts */ - #ifndef _MSC_VER - #define cpuid(reg, leaf, sub)\ - __asm__ __volatile__ ("cpuid":\ - "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ - "a" (leaf), "c"(sub)); - - #define XASM_LINK(f) asm(f) - #else - - #include - #define cpuid(a,b) __cpuid((int*)a,b) - - #define XASM_LINK(f) - #endif /* _MSC_VER */ - - #define EAX 0 - #define EBX 1 - #define ECX 2 - #define EDX 3 - - #define CPUID_AVX1 0x1 - #define CPUID_AVX2 0x2 - #define CPUID_RDRAND 0x4 - #define CPUID_RDSEED 0x8 - #define CPUID_BMI2 0x10 /* MULX, RORX */ - - #define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1) - #define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2) - #define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2) - #define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND) - #define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED) - - static word32 cpuid_check = 0; - static word32 cpuid_flags = 0; - - static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { - int got_intel_cpu = 0; - unsigned int reg[5]; - - reg[4] = '\0'; - cpuid(reg, 0, 0); - if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 && - XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 && - XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) { - got_intel_cpu = 1; - } - if (got_intel_cpu) { - cpuid(reg, leaf, sub); - return ((reg[num] >> bit) & 0x1); - } - return 0; - } - - - static int set_cpuid_flags() { - if(cpuid_check ==0) { - if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} - if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } - if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; } - if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } - if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } - cpuid_check = 1 ; - return 0 ; - } - return 1 ; - } - - #if defined(HAVE_INTEL_AVX1) static int Transform_AVX1(Sha512 *sha512); #endif @@ -340,6 +271,7 @@ static int InitSha512(Sha512* sha512) #endif static int _Transform(Sha512 *sha512); static int (*Transform_p)(Sha512* sha512) = _Transform; + static int transform_check = 0; #define Transform(sha512) (*Transform_p)(sha512) /* Dummy for saving MM_REGs on behalf of Transform */ @@ -353,6 +285,28 @@ static int InitSha512(Sha512* sha512) "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15") #endif + static void Sha512_SetTransform() + { + word32 intel_flags; + + if (transform_check) + return; + transform_check = 1; + intel_flags = cpuid_get_flags(); + + #if defined(HAVE_INTEL_AVX2) + if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) { + Transform_p = Transform_AVX1_RORX; return; + Transform_p = Transform_AVX2; + /* for avoiding warning,"not used" */ + } + #endif + #if defined(HAVE_INTEL_AVX1) + Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? 
Transform_AVX1 : + _Transform); return; + #endif + Transform_p = _Transform; + } int wc_InitSha512_ex(Sha512* sha512, void* heap, int devId) { @@ -361,20 +315,7 @@ static int InitSha512(Sha512* sha512) (void)heap; (void)devId; - if (set_cpuid_flags()) - return ret; - - #if defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX2 && IS_INTEL_BMI2) { - Transform_p = Transform_AVX1_RORX; return ret; - Transform_p = Transform_AVX2; - /* for avoiding warning,"not used" */ - } - #endif - #if defined(HAVE_INTEL_AVX1) - Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform); return ret; - #endif - Transform_p = _Transform; + Sha512_SetTransform(); return ret; } @@ -554,6 +495,11 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len) int ret = 0; /* do block size increments */ byte* local = (byte*)sha512->buffer; +#if defined(LITTLE_ENDIAN_ORDER) +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + word32 intel_flags = cpuid_get_flags(); +#endif +#endif if (sha512 == NULL || (data == NULL && len > 0)) { return BAD_FUNC_ARG; @@ -570,16 +516,18 @@ static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len) XMEMCPY(&local[sha512->buffLen], data, add); sha512->buffLen += add; - data += add; - len -= add; + data += add; + len -= add; if (sha512->buffLen == SHA512_BLOCK_SIZE) { #if defined(LITTLE_ENDIAN_ORDER) #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) #endif + { ByteReverseWords64(sha512->buffer, sha512->buffer, - SHA512_BLOCK_SIZE); + SHA512_BLOCK_SIZE); + } #endif ret = Transform(sha512); if (ret != 0) @@ -615,6 +563,11 @@ static INLINE int Sha512Final(Sha512* sha512) { byte* local = (byte*)sha512->buffer; int ret; +#if defined(LITTLE_ENDIAN_ORDER) +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + word32 intel_flags = cpuid_get_flags(); +#endif +#endif if (sha512 == NULL) { return BAD_FUNC_ARG; @@ -629,13 +582,15 @@ static INLINE int Sha512Final(Sha512* sha512) if (sha512->buffLen > SHA512_PAD_SIZE) { XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen); sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen; -#if defined(LITTLE_ENDIAN_ORDER) - #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) - #endif - ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE); - -#endif /* LITTLE_ENDIAN_ORDER */ + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) + #endif + { + ByteReverseWords64(sha512->buffer,sha512->buffer, + SHA512_BLOCK_SIZE); + } + #endif /* LITTLE_ENDIAN_ORDER */ ret = Transform(sha512); if (ret != 0) return ret; @@ -651,17 +606,19 @@ static INLINE int Sha512Final(Sha512* sha512) /* store lengths */ #if defined(LITTLE_ENDIAN_ORDER) -#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) -#endif - ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE); + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) + #endif + { + ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE); + } #endif /* ! length ordering dependent on digest endian type ! 
*/ sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) - if (IS_INTEL_AVX1 || IS_INTEL_AVX2) + if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags)) ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]), &(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]), SHA512_BLOCK_SIZE - SHA512_PAD_SIZE); @@ -1470,6 +1427,21 @@ int wc_Sha384Final(Sha384* sha384, byte* hash) } +/* Hardware Acceleration */ +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + + int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId) + { + int ret = InitSha384(sha384); + + (void)heap; + (void)devId; + + Sha512_SetTransform(); + + return ret; + } +#else int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId) { int ret; @@ -1492,6 +1464,7 @@ int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId) return ret; } +#endif int wc_InitSha384(Sha384* sha384) { diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 08c7e9574..a7dd79e6d 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -195,6 +195,12 @@ static int devId = INVALID_DEVID; const char* wnrConfigFile = "wnr-example.conf"; #endif +#ifdef HAVE_AESGCM +#define LARGE_BUFFER_SIZE 1024 +static byte large_input[LARGE_BUFFER_SIZE]; +static byte large_output[LARGE_BUFFER_SIZE]; +static byte large_outdec[LARGE_BUFFER_SIZE]; +#endif typedef struct testVector { const char* input; @@ -375,6 +381,9 @@ int wolfcrypt_test(void* args) #endif { int ret; +#ifdef HAVE_AESGCM + int i; +#endif ((func_args*)args)->return_code = -1; /* error state */ @@ -665,6 +674,8 @@ int wolfcrypt_test(void* args) printf( "AES256 test passed!\n"); #ifdef HAVE_AESGCM + for (i=0; i + + +#ifdef __cplusplus + extern "C" { +#endif + +#ifdef WOLFSSL_X86_64_BUILD + #define CPUID_AVX1 0x0001 + #define CPUID_AVX2 0x0002 + #define CPUID_RDRAND 0x0004 + #define CPUID_RDSEED 0x0008 + #define CPUID_BMI2 0x0010 /* MULX, RORX */ + #define CPUID_AESNI 0x0020 + + #define IS_INTEL_AVX1(f) ((f) & CPUID_AVX1) + #define IS_INTEL_AVX2(f) ((f) & CPUID_AVX2) + #define IS_INTEL_RDRAND(f) ((f) & CPUID_RDRAND) + #define IS_INTEL_RDSEED(f) ((f) & CPUID_RDSEED) + #define IS_INTEL_BMI2(f) ((f) & CPUID_BMI2) + #define IS_INTEL_AESNI(f) ((f) & CPUID_AESNI) + + void cpuid_set_flags(void); + word32 cpuid_get_flags(void); +#endif + +#ifdef __cplusplus + } /* extern "C" */ +#endif + + +#endif /* WOLF_CRYPT_CPUID_H */ diff --git a/wolfssl/wolfcrypt/include.am b/wolfssl/wolfcrypt/include.am index a2a741f34..ca277093c 100644 --- a/wolfssl/wolfcrypt/include.am +++ b/wolfssl/wolfcrypt/include.am @@ -60,7 +60,8 @@ nobase_include_HEADERS+= \ wolfssl/wolfcrypt/wolfevent.h \ wolfssl/wolfcrypt/pkcs12.h \ wolfssl/wolfcrypt/wolfmath.h \ - wolfssl/wolfcrypt/sha3.h + wolfssl/wolfcrypt/sha3.h \ + wolfssl/wolfcrypt/cpuid.h noinst_HEADERS+= \ wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h \
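The test.c hunk above adds 1 KB static buffers so AES-GCM is exercised on data larger than one pass of the unrolled 8-block path. A minimal sketch of such a large-buffer roundtrip against the public wolfCrypt API; the key, IV and zero-length AAD are placeholder values and the default (non-async) build is assumed:

#include <wolfssl/wolfcrypt/aes.h>

/* Sketch: AES-128-GCM encrypt/decrypt roundtrip over a 1KB buffer.
 * Returns 0 on success, non-zero on any API or verification failure. */
static int gcm_large_roundtrip(void)
{
    Aes  aes;
    byte key[16] = {0};            /* placeholder key */
    byte iv[12]  = {0};            /* placeholder 96-bit IV */
    byte tag[16];
    static byte in[1024], out[1024], dec[1024];
    int  i, ret;

    for (i = 0; i < (int)sizeof(in); i++)
        in[i] = (byte)i;           /* deterministic fill */

    ret = wc_AesGcmSetKey(&aes, key, (word32)sizeof(key));
    if (ret == 0)
        ret = wc_AesGcmEncrypt(&aes, out, in, (word32)sizeof(in),
                               iv, (word32)sizeof(iv),
                               tag, (word32)sizeof(tag), NULL, 0);
    if (ret == 0)
        ret = wc_AesGcmDecrypt(&aes, dec, out, (word32)sizeof(out),
                               iv, (word32)sizeof(iv),
                               tag, (word32)sizeof(tag), NULL, 0);
    if (ret == 0 && XMEMCMP(in, dec, sizeof(in)) != 0)
        ret = -1;                  /* decrypted data did not match input */
    return ret;
}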