From 6c848e7133ef66c93f35f46a2cc1e72c834b6ccb Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Mon, 13 Nov 2017 14:30:31 +1000
Subject: [PATCH] Improve AES-GCM performance on AVX1 (IvyBridge) and AVX2

---
 wolfcrypt/src/aes.c | 2695 ++++++++++++++++++++++---------------------
 1 file changed, 1376 insertions(+), 1319 deletions(-)

diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 2684d8a8d..3123006e5 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -3420,39 +3420,6 @@ static const __m128i MOD2_128 = M128_INIT(0x1, 0xc200000000000000UL);
 static __m128i gfmul_sw(__m128i a, __m128i b)
 {
     __m128i r, t1, t2, t3, t4, t5, t6, t7;
-#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL
-    /* 128 x 128 Carryless Multiply */
-    t3 = _mm_clmulepi64_si128(a, b, 0x10);
-    t2 = _mm_clmulepi64_si128(a, b, 0x01);
-    t1 = _mm_clmulepi64_si128(a, b, 0x00);
-    t4 = _mm_clmulepi64_si128(a, b, 0x11);
-    t3 = _mm_xor_si128(t3, t2);
-    t2 = _mm_slli_si128(t3, 8);
-    t3 = _mm_srli_si128(t3, 8);
-    t1 = _mm_xor_si128(t1, t2);
-    t4 = _mm_xor_si128(t4, t3);
-
-    /* shift left 1 bit - bits reversed */
-    t5 = _mm_srli_epi32(t1, 31);
-    t6 = _mm_srli_epi32(t4, 31);
-    t1 = _mm_slli_epi32(t1, 1);
-    t4 = _mm_slli_epi32(t4, 1);
-    t7 = _mm_srli_si128(t5, 12);
-    t5 = _mm_slli_si128(t5, 4);
-    t6 = _mm_slli_si128(t6, 4);
-    t4 = _mm_or_si128(t4, t7);
-    t1 = _mm_or_si128(t1, t5);
-    t4 = _mm_or_si128(t4, t6);
-
-    /* Reduction */
-    t2 = _mm_clmulepi64_si128(t1, MOD2_128, 0x10);
-    t3 = _mm_shuffle_epi32(t1, 78);
-    t3 = _mm_xor_si128(t3, t2);
-    t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10);
-    t3 = _mm_shuffle_epi32(t3, 78);
-    t3 = _mm_xor_si128(t3, t2);
-    r = _mm_xor_si128(t4, t3);
-#else
     t2 = _mm_shuffle_epi32(b, 78);
     t3 = _mm_shuffle_epi32(a, 78);
     t2 = _mm_xor_si128(t2, b);
@@ -3496,27 +3463,16 @@ static __m128i gfmul_sw(__m128i a, __m128i b)
     t7 = _mm_xor_si128(t7, t6);
     t7 = _mm_xor_si128(t7, t1);
     r = _mm_xor_si128(t4, t7);
-#endif

     return r;
 }
 
+
 static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1)
 {
     __m128i t1, t2, t3, t4;
 
     /* 128 x 128 Carryless Multiply */
-#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL
-    t3 = _mm_clmulepi64_si128(a, b, 0x10);
-    t2 = _mm_clmulepi64_si128(a, b, 0x01);
-    t1 = _mm_clmulepi64_si128(a, b, 0x00);
-    t4 = _mm_clmulepi64_si128(a, b, 0x11);
-    t3 = _mm_xor_si128(t3, t2);
-    t2 = _mm_slli_si128(t3, 8);
-    t3 = _mm_srli_si128(t3, 8);
-    t1 = _mm_xor_si128(t1, t2);
-    t4 = _mm_xor_si128(t4, t3);
-#else
     t2 = _mm_shuffle_epi32(b, 78);
     t3 = _mm_shuffle_epi32(a, 78);
     t2 = _mm_xor_si128(t2, b);
@@ -3530,7 +3486,6 @@ static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1)
     t2 = _mm_srli_si128(t2, 8);
     t1 = _mm_xor_si128(t1, t3);
     t4 = _mm_xor_si128(t4, t2);
-#endif
     *r0 = _mm_xor_si128(t1, *r0);
     *r1 = _mm_xor_si128(t4, *r1);
 }
@@ -3553,15 +3508,6 @@ static __m128i gfmul_shl1(__m128i a)
 static __m128i ghash_red(__m128i r0, __m128i r1)
 {
     __m128i t2, t3;
-#ifndef WOLFSSL_AES_GCM_SLOW_CLMUL
-    t2 = _mm_clmulepi64_si128(r0, MOD2_128, 0x10);
-    t3 = _mm_shuffle_epi32(r0, 78);
-    t3 = _mm_xor_si128(t3, t2);
-    t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10);
-    t3 = _mm_shuffle_epi32(t3, 78);
-    t3 = _mm_xor_si128(t3, t2);
-    return _mm_xor_si128(r1, t3);
-#else
     __m128i t5, t6, t7;
 
     t5 = _mm_slli_epi32(r0, 31);
@@ -3582,7 +3528,6 @@ static __m128i ghash_red(__m128i r0, __m128i r1)
     t7 = _mm_xor_si128(t7, t6);
     t7 = _mm_xor_si128(t7, r0);
     return _mm_xor_si128(r1, t7);
-#endif
 }
 
 static __m128i gfmul_shifted(__m128i a, __m128i b)
@@ -3611,6 +3556,103 @@ static __m128i gfmul8(__m128i a1, __m128i a2, __m128i a3, __m128i a4,
 }
 #endif
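For reference, the block added below reinstates the PCLMULQDQ-based GHASH multiply as AVX2-only helpers (gfmul_sw_avx2() and friends), so the generic gfmul_sw() above keeps only the shift-and-xor path. A minimal standalone sketch of that multiply-and-reduce step, mirroring gfmul_sw_avx2() and assuming a GCC/Clang build with -msse2 -mpclmul (the constant is the same value as MOD2_128 in aes.c; the function name here is illustrative only):

#include <emmintrin.h>
#include <wmmintrin.h>

/* GHASH multiply r = a * b in GF(2^128) with GCM's bit ordering,
 * using four PCLMULQDQs for the product and two for the reduction. */
static __m128i ghash_gfmul_clmul(__m128i a, __m128i b)
{
    __m128i r, t1, t2, t3, t4, t5, t6, t7;
    /* Same value as MOD2_128 in aes.c: low qword 0x1, high qword 0xc2.. */
    const __m128i mod2_128 =
        _mm_set_epi64x((long long)0xc200000000000000ULL, 0x1);

    /* 128 x 128 carry-less multiply (schoolbook on 64-bit halves) */
    t3 = _mm_clmulepi64_si128(a, b, 0x10);
    t2 = _mm_clmulepi64_si128(a, b, 0x01);
    t1 = _mm_clmulepi64_si128(a, b, 0x00);
    t4 = _mm_clmulepi64_si128(a, b, 0x11);
    t3 = _mm_xor_si128(t3, t2);
    t2 = _mm_slli_si128(t3, 8);
    t3 = _mm_srli_si128(t3, 8);
    t1 = _mm_xor_si128(t1, t2);          /* low 128 bits of the product  */
    t4 = _mm_xor_si128(t4, t3);          /* high 128 bits of the product */

    /* Shift the 256-bit product left by one bit (GCM's reversed bits) */
    t5 = _mm_srli_epi32(t1, 31);
    t6 = _mm_srli_epi32(t4, 31);
    t1 = _mm_slli_epi32(t1, 1);
    t4 = _mm_slli_epi32(t4, 1);
    t7 = _mm_srli_si128(t5, 12);
    t5 = _mm_slli_si128(t5, 4);
    t6 = _mm_slli_si128(t6, 4);
    t4 = _mm_or_si128(t4, t7);
    t1 = _mm_or_si128(t1, t5);
    t4 = _mm_or_si128(t4, t6);

    /* Reduce modulo x^128 + x^7 + x^2 + x + 1 with two more CLMULs */
    t2 = _mm_clmulepi64_si128(t1, mod2_128, 0x10);
    t3 = _mm_shuffle_epi32(t1, 78);
    t3 = _mm_xor_si128(t3, t2);
    t2 = _mm_clmulepi64_si128(t3, mod2_128, 0x10);
    t3 = _mm_shuffle_epi32(t3, 78);
    t3 = _mm_xor_si128(t3, t2);
    r = _mm_xor_si128(t4, t3);

    return r;
}

The gfmul_only_avx2()/ghash_red_avx2() pair below splits this into the accumulation and reduction halves so that eight blocks can be folded before a single reduction, which is where the unrolled loops get their speed-up.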
+#ifdef HAVE_INTEL_AVX2 +static __m128i gfmul_sw_avx2(__m128i a, __m128i b) +{ + __m128i r, t1, t2, t3, t4, t5, t6, t7; + /* 128 x 128 Carryless Multiply */ + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); + + /* shift left 1 bit - bits reversed */ + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + /* Reduction */ + t2 = _mm_clmulepi64_si128(t1, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t1, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + r = _mm_xor_si128(t4, t3); + + return r; +} + +static void gfmul_only_avx2(__m128i a, __m128i b, __m128i* r0, __m128i* r1) +{ + __m128i t1, t2, t3, t4; + + /* 128 x 128 Carryless Multiply */ + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); + *r0 = _mm_xor_si128(t1, *r0); + *r1 = _mm_xor_si128(t4, *r1); +} + +static __m128i ghash_red_avx2(__m128i r0, __m128i r1) +{ + __m128i t2, t3; + t2 = _mm_clmulepi64_si128(r0, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(r0, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + return _mm_xor_si128(r1, t3); +} + +static __m128i gfmul_shifted_avx2(__m128i a, __m128i b) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only_avx2(a, b, &t0, &t1); + return ghash_red_avx2(t0, t1); +} + +#ifndef AES_GCM_AESNI_NO_UNROLL +static __m128i gfmul8_avx2(__m128i a1, __m128i a2, __m128i a3, __m128i a4, + __m128i a5, __m128i a6, __m128i a7, __m128i a8, + __m128i b1, __m128i b2, __m128i b3, __m128i b4, + __m128i b5, __m128i b6, __m128i b7, __m128i b8) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only_avx2(a1, b8, &t0, &t1); + gfmul_only_avx2(a2, b7, &t0, &t1); + gfmul_only_avx2(a3, b6, &t0, &t1); + gfmul_only_avx2(a4, b5, &t0, &t1); + gfmul_only_avx2(a5, b4, &t0, &t1); + gfmul_only_avx2(a6, b3, &t0, &t1); + gfmul_only_avx2(a7, b2, &t0, &t1); + gfmul_only_avx2(a8, b1, &t0, &t1); + return ghash_red_avx2(t0, t1); +} +#endif /* AES_GCM_AESNI_NO_UNROLL */ +#endif /* HAVE_INTEL_AVX2 */ + + /* See Intel® Carry-Less Multiplication Instruction * and its Usage for Computing the GCM Mode White Paper * by Shay Gueron, Intel Mobility Group, Israel Development Center; @@ -3632,7 +3674,196 @@ static const __m128i EIGHT = M128_INIT(0x0, 0x8); static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); static const __m128i BSWAP_MASK = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); -static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, +static INLINE void aes_gcm_calc_iv_12(__m128i* KEY, const unsigned char* ivec, + int nr, __m128i* hp, __m128i* yp, + __m128i* tp, __m128i* xp) +{ + __m128i H, Y, 
T; + __m128i X = _mm_setzero_si128(); + __m128i lastKey; + __m128i tmp1, tmp2; + int j; + + Y = _mm_setzero_si128(); + for (j=0; j < 12; j++) + ((unsigned char*)&Y)[j] = ivec[j]; + Y = _mm_insert_epi32(Y, 0x1000000, 3); + + /* (Compute E[ZERO, KS] and E[Y0, KS] together */ + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp2 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp2 = _mm_aesenc_si128(tmp2, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); + T = _mm_aesenclast_si128(tmp2, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); + + *hp = H; + *yp = Y; + *tp = T; + *xp = X; +} + +static INLINE void aes_gcm_calc_iv(__m128i* KEY, const unsigned char* ivec, + unsigned int ibytes, int nr, __m128i* hp, + __m128i* yp, __m128i* tp, __m128i* xp) +{ + __m128i H, Y, T; + __m128i X = _mm_setzero_si128(); + __m128i lastKey; + __m128i last_block = _mm_setzero_si128(); + __m128i tmp1; + int i, j; + + if (ibytes % 16) { + i = ibytes / 16; + for (j=0; j < (int)(ibytes%16); j++) + ((unsigned char*)&last_block)[j] = ivec[i*16+j]; + } + tmp1 = _mm_xor_si128(X, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + H = _mm_aesenclast_si128(tmp1, lastKey); + H = _mm_shuffle_epi8(H, BSWAP_MASK); + Y = _mm_setzero_si128(); + for (i=0; i < (int)(ibytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + if (ibytes % 16) { + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + } + tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, 0, 1); + Y = _mm_xor_si128(Y, tmp1); + Y = gfmul_sw(Y, H); + Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ + tmp1 = _mm_xor_si128(Y, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = 
_mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + T = _mm_aesenclast_si128(tmp1, lastKey); + + *hp = H; + *yp = Y; + *tp = T; + *xp = X; +} + +#define AES_ENC_8(j) \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); \ + tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); \ + tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); \ + tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); \ + tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); \ + tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); \ + tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); + +#define AES_ENC_LAST_8() \ + tmp1 =_mm_aesenclast_si128(tmp1, lastKey); \ + tmp2 =_mm_aesenclast_si128(tmp2, lastKey); \ + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); \ + tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); \ + tmp3 =_mm_aesenclast_si128(tmp3, lastKey); \ + tmp4 =_mm_aesenclast_si128(tmp4, lastKey); \ + tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); \ + tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); \ + tmp5 =_mm_aesenclast_si128(tmp5, lastKey); \ + tmp6 =_mm_aesenclast_si128(tmp6, lastKey); \ + tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); \ + tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); \ + tmp7 =_mm_aesenclast_si128(tmp7, lastKey); \ + tmp8 =_mm_aesenclast_si128(tmp8, lastKey); \ + tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); \ + tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); + +void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr); +void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, unsigned int nbytes, @@ -3653,121 +3884,10 @@ static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; #endif - if (ibytes == 12) { - Y = _mm_setzero_si128(); - for (j=0; j < 12; j++) - ((unsigned char*)&Y)[j] = ivec[j]; - Y = _mm_insert_epi32(Y, 0x1000000, 3); - /* (Compute E[ZERO, KS] and E[Y0, KS] together */ - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp2 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp1 = 
_mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - T = _mm_aesenclast_si128(tmp2, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - } - else { - if (ibytes % 16) { - i = ibytes / 16; - for (j=0; j < (int)(ibytes%16); j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; - } - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - Y = _mm_setzero_si128(); - for (i=0; i < (int)(ibytes/16); i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - if (ibytes % 16) { - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); - tmp1 = _mm_insert_epi64(tmp1, 0, 1); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ - tmp1 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - T = _mm_aesenclast_si128(tmp1, lastKey); - } + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, &H, &Y, &T, &X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, &H, &Y, &T, &X); for (i=0; i < (int)(abytes/16); i++) { tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); @@ -3825,141 +3945,27 @@ static void AES_GCM_encrypt(const unsigned char *in, unsigned char 
*out, tmp6 =_mm_xor_si128(tmp6, KEY[0]); tmp7 =_mm_xor_si128(tmp7, KEY[0]); tmp8 =_mm_xor_si128(tmp8, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + AES_ENC_8(1); + AES_ENC_8(2); + AES_ENC_8(3); + AES_ENC_8(4); + AES_ENC_8(5); + AES_ENC_8(6); + AES_ENC_8(7); + AES_ENC_8(8); + AES_ENC_8(9); lastKey = KEY[10]; if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); - tmp8 = 
_mm_aesenc_si128(tmp8, KEY[10]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + AES_ENC_8(10); + AES_ENC_8(11); lastKey = KEY[12]; if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + AES_ENC_8(12); + AES_ENC_8(13); lastKey = KEY[14]; } } - tmp1 =_mm_aesenclast_si128(tmp1, lastKey); - tmp2 =_mm_aesenclast_si128(tmp2, lastKey); - tmp3 =_mm_aesenclast_si128(tmp3, lastKey); - tmp4 =_mm_aesenclast_si128(tmp4, lastKey); - tmp5 =_mm_aesenclast_si128(tmp5, lastKey); - tmp6 =_mm_aesenclast_si128(tmp6, lastKey); - tmp7 =_mm_aesenclast_si128(tmp7, lastKey); - tmp8 =_mm_aesenclast_si128(tmp8, lastKey); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[0])); - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[1])); - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[2])); - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[3])); - tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[4])); - tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[5])); - tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[6])); - tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[7])); - _mm_storeu_si128(&((__m128i*)out)[0], tmp1); - _mm_storeu_si128(&((__m128i*)out)[1], tmp2); - _mm_storeu_si128(&((__m128i*)out)[2], tmp3); - _mm_storeu_si128(&((__m128i*)out)[3], tmp4); - _mm_storeu_si128(&((__m128i*)out)[4], tmp5); - _mm_storeu_si128(&((__m128i*)out)[5], tmp6); - _mm_storeu_si128(&((__m128i*)out)[6], tmp7); - _mm_storeu_si128(&((__m128i*)out)[7], tmp8); + AES_ENC_LAST_8(); for (i=1; i < (int)(nbytes/16/8); i++) { r0 = _mm_setzero_si128(); @@ -4134,30 +4140,7 @@ static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, lastKey = KEY[14]; } } - tmp1 =_mm_aesenclast_si128(tmp1, lastKey); - tmp2 =_mm_aesenclast_si128(tmp2, lastKey); - tmp3 =_mm_aesenclast_si128(tmp3, lastKey); - tmp4 =_mm_aesenclast_si128(tmp4, lastKey); - tmp5 =_mm_aesenclast_si128(tmp5, lastKey); - tmp6 =_mm_aesenclast_si128(tmp6, lastKey); - tmp7 =_mm_aesenclast_si128(tmp7, lastKey); - tmp8 =_mm_aesenclast_si128(tmp8, lastKey); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); - tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); - tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); - tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); - tmp8 = _mm_xor_si128(tmp8, 
_mm_loadu_si128(&((__m128i*)in)[i*8+7])); - _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); - _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); - _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); - _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); - _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); - _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); - _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); - _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); + AES_ENC_LAST_8(); } tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); @@ -4315,46 +4298,367 @@ static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, _mm_storeu_si128((__m128i*)tag, T); } -#ifdef HAVE_INTEL_AVX2 +#ifdef HAVE_INTEL_AVX1 /* Encrypt with key in xmm12. */ -#define VAESENC() \ - "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ - "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ - "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ - "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ - "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ - "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ - "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ - "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" +#define VAESENC() \ + "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" + +#define VAESENC_SET(o) \ + "vmovaps "#o"(%[KEY]), %%xmm12\n\t" \ + VAESENC() + +#define VAESENC_CTR() \ + "vmovaps (%[pctr1]), %%xmm0\n\t" \ + "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" \ + "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" \ + "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" \ + "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" \ + "vpaddd %[TWO], %%xmm0, %[tmp3]\n\t" \ + "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" \ + "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" \ + "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" \ + "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" \ + "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" \ + "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" \ + "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" \ + "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" \ + "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" \ + "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" \ + "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" \ + "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" + +#define VAESENC_XOR() \ + "vmovaps (%[KEY]), %%xmm12\n\t" \ + "vmovaps %%xmm0, (%[pctr1])\n\t" \ + "vpxor %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vpxor %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vpxor %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vpxor %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vpxor %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vpxor %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vpxor %%xmm12, %[tmp8], %[tmp8]\n\t" -/* Encrypt and GCM mul with the nth round key. 
*/ -#define VAESENC_PCLMUL_N(o1, o2, o3) \ - "vmovaps "#o1"(%[KEY]), %%xmm12\n\t" \ - "vmovdqu "#o2"(%[out]), %%xmm1\n\t" \ - "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ - "vmovaps "#o3"(%[HT]), %%xmm0\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ - "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" \ - "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ - "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" \ - "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ - "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" \ - "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ - "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" \ - "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ - "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ - "vpslldq $8, %%xmm13, %%xmm14\n\t" \ - "vpsrldq $8, %%xmm13, %%xmm13\n\t" \ - "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ - "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ - "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" \ - "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" +/* Encrypt and carry-less multiply for AVX1. */ +#define VAESENC_PCLMUL_1(src, o1, o2, o3) \ + "vmovdqa "#o3"(%[HT]), %%xmm12\n\t" \ + "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ + "vpxor %[XV], %%xmm0, %%xmm0\n\t" \ + "vpshufd $78, %%xmm12, %%xmm1\n\t" \ + "vpshufd $78, %%xmm0, %%xmm14\n\t" \ + "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm3\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm2\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpclmulqdq $0x00, %%xmm14, %%xmm1, %%xmm1\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_N(src, o1, o2, o3) \ + "vmovdqa "#o3"(%[HT]), %%xmm12\n\t" \ + "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ + "vpshufd $78, %%xmm12, %%xmm13\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \ + "vpshufd $78, %%xmm0, %%xmm14\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm15\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm12\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpclmulqdq $0x00, %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm15, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm15, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ -static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, +#define VAESENC_PCLMUL_L(o) \ + "vpslldq $8, %%xmm1, %%xmm14\n\t" \ + "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + 
"vpslld $31, %%xmm2, %%xmm12\n\t" \ + "vpslld $30, %%xmm2, %%xmm13\n\t" \ + "vpslld $25, %%xmm2, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpxor %%xmm13, %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm14, %%xmm12, %%xmm12\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vpsrldq $4, %%xmm12, %%xmm13\n\t" \ + "vpslldq $12, %%xmm12, %%xmm12\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpsrld $1, %%xmm2, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vpsrld $2, %%xmm2, %%xmm1\n\t" \ + "vpsrld $7, %%xmm2, %%xmm0\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vpxor %%xmm1, %%xmm14, %%xmm14\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm14, %%xmm14\n\t" \ + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ + + +/* Encrypt and carry-less multiply for AVX2. */ +#define VAESENC_PCLMUL_AVX2_1(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %[XV], %%xmm12, %%xmm12\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm1\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm2\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, %[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm3\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ + +#define VAESENC_PCLMUL_AVX2_2(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm13\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm15\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, %[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm12\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_AVX2_N(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm13\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm15\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, %[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm12\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ + "vpxor 
%%xmm12, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_AVX2_L(o) \ + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ + "vpslldq $8, %%xmm1, %%xmm12\n\t" \ + "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ + "vmovdqa "#o"(%[KEY]), %%xmm15\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ + "vaesenc %%xmm15, %[tmp1], %[tmp1]\n\t" \ + "vmovdqa %[MOD2_128], %%xmm0\n\t" \ + "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ + "vaesenc %%xmm15, %[tmp2], %[tmp2]\n\t" \ + "vaesenc %%xmm15, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm15, %[tmp4], %[tmp4]\n\t" \ + "vpshufd $78, %%xmm2, %%xmm13\n\t" \ + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vpclmulqdq $0x10, %%xmm0, %%xmm13, %%xmm14\n\t" \ + "vaesenc %%xmm15, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm15, %[tmp6], %[tmp6]\n\t" \ + "vaesenc %%xmm15, %[tmp7], %[tmp7]\n\t" \ + "vpshufd $78, %%xmm13, %%xmm13\n\t" \ + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" \ + "vmovdqa %%xmm13, %%xmm2\n\t" \ + "vaesenc %%xmm15, %[tmp8], %[tmp8]\n\t" + + +/* Encrypt and carry-less multiply with last key. */ +#define VAESENC_LAST() \ + "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" \ + "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" \ + "vmovdqu %[tmp1], (%[out])\n\t" \ + "vmovdqu %[tmp2], 16(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" \ + "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" \ + "vmovdqu %[tmp3], 32(%[out])\n\t" \ + "vmovdqu %[tmp4], 48(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" \ + "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" \ + "vmovdqu %[tmp5], 64(%[out])\n\t" \ + "vmovdqu %[tmp6], 80(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vaesenclast %%xmm12, %[tmp8], %[tmp8]\n\t" \ + "vpxor 96(%[in]), %[tmp7], %[tmp7]\n\t" \ + "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" \ + "vmovdqu %[tmp7], 96(%[out])\n\t" \ + "vmovdqu %[tmp8], 112(%[out])\n\t" + +#define VAESENC_BLOCK() \ + "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" \ + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" \ + "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "cmpl $11, %[nr]\n\t" \ + "vmovaps 160(%[KEY]), %[tmp2]\n\t" \ + "jl %=f\n\t" \ + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "cmpl $13, %[nr]\n\t" \ + "vmovaps 192(%[KEY]), %[tmp2]\n\t" \ + "jl %=f\n\t" \ + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vmovaps 224(%[KEY]), %[tmp2]\n\t" \ + "%=:\n\t" \ + "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" \ + "vmovdqu %[tmp1], (%[out])\n\t" \ + "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" \ + "vpxor %[tmp1], %[X], %[X]\n\t" + +static INLINE void aes_gcm_avx1_calc_iv_12(__m128i* KEY, + const unsigned char* ivec, int nr, + __m128i* hp, __m128i* yp, + __m128i* tp, __m128i* xp) +{ + 
register __m128i H; + register __m128i T; + register __m128i X = _mm_setzero_si128(); + __m128i Y = _mm_setzero_si128(); + int j; + + for (j=0; j < 12; j++) + ((unsigned char*)&Y)[j] = ivec[j]; + Y = _mm_insert_epi32(Y, 0x1000000, 3); + + __asm__ __volatile__ ( + "vmovaps 0(%[KEY]), %%xmm5\n\t" + "vmovaps 16(%[KEY]), %%xmm6\n\t" + "vpxor %%xmm5, %[X], %[H]\n\t" + "vpxor %%xmm5, %[Y], %[T]\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "vmovaps 32(%[KEY]), %%xmm5\n\t" + "vmovaps 48(%[KEY]), %%xmm6\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "vmovaps 64(%[KEY]), %%xmm5\n\t" + "vmovaps 80(%[KEY]), %%xmm6\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "vmovaps 96(%[KEY]), %%xmm5\n\t" + "vmovaps 112(%[KEY]), %%xmm6\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "vmovaps 128(%[KEY]), %%xmm5\n\t" + "vmovaps 144(%[KEY]), %%xmm6\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" + "jl %=f\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vmovaps 176(%[KEY]), %%xmm6\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "cmpl $13, %[nr]\n\t" + "jl %=f\n\t" + "vaesenc %%xmm5, %[H], %[H]\n\t" + "vaesenc %%xmm5, %[T], %[T]\n\t" + "vmovaps 208(%[KEY]), %%xmm6\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" + "vaesenc %%xmm6, %[H], %[H]\n\t" + "vaesenc %%xmm6, %[T], %[T]\n\t" + "%=:\n\t" + "vaesenclast %%xmm5, %[H], %[H]\n\t" + "vaesenclast %%xmm5, %[T], %[T]\n\t" + "vpshufb %[BSWAP_MASK], %[H], %[H]\n\t" + + : [H] "=xr" (H), [Y] "+xr" (Y), [T] "=xr" (T), [X] "+xr" (X) + : [KEY] "r" (KEY), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK) + : "memory", "xmm5", "xmm6" + ); + + *hp = H; + *yp = Y; + *tp = T; + *xp = X; +} + +void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr); +void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, unsigned int nbytes, @@ -4367,7 +4671,7 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, __m128i X = _mm_setzero_si128(); __m128i *KEY = (__m128i*)key, lastKey; __m128i last_block = _mm_setzero_si128(); -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) __m128i HT[8]; register __m128i tmp1 asm("xmm4"); register __m128i tmp2 asm("xmm5"); @@ -4383,121 +4687,10 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, __m128i tmp1, tmp2; #endif - if (ibytes == 12) { - Y = _mm_setzero_si128(); - for (j=0; j < 12; j++) - ((unsigned char*)&Y)[j] = ivec[j]; - Y = _mm_insert_epi32(Y, 0x1000000, 3); - /* (Compute E[ZERO, KS] and E[Y0, KS] together */ - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp2 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); 
- tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - T = _mm_aesenclast_si128(tmp2, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - } - else { - if (ibytes % 16) { - i = ibytes / 16; - for (j=0; j < (int)(ibytes%16); j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; - } - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - Y = _mm_setzero_si128(); - for (i=0; i < (int)(ibytes/16); i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - if (ibytes % 16) { - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); - tmp1 = _mm_insert_epi64(tmp1, 0, 1); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ - tmp1 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - T = _mm_aesenclast_si128(tmp1, lastKey); - } + if (ibytes == 12) + aes_gcm_avx1_calc_iv_12(KEY, ivec, nr, &H, &Y, &T, &X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, &H, &Y, &T, &X); 
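For reference, the two branches above follow the usual GCM IV handling: a 12-byte IV forms the pre-counter block J0 = IV || 0x00000001 directly (the fast path in aes_gcm_avx1_calc_iv_12()), while any other IV length derives Y0 by GHASHing the padded IV and its bit length (aes_gcm_calc_iv()). A minimal sketch of the 12-byte construction, mirroring the loop used in those helpers and assuming SSE4.1 for _mm_insert_epi32() (the function name here is illustrative only):

#include <emmintrin.h>
#include <smmintrin.h>    /* _mm_insert_epi32() requires SSE4.1 */

/* Build J0 = IV || 0x00000001 for a 96-bit IV. The constant 0x1000000 is a
 * big-endian 1 written into the last 32-bit lane, as in aes_gcm_calc_iv_12(). */
static __m128i gcm_counter0_from_iv12(const unsigned char iv[12])
{
    __m128i Y = _mm_setzero_si128();
    int j;

    for (j = 0; j < 12; j++)
        ((unsigned char*)&Y)[j] = iv[j];
    return _mm_insert_epi32(Y, 0x1000000, 3);
}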
for (i=0; i < (int)(abytes/16); i++) { tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); @@ -4519,7 +4712,7 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, ctr1 = _mm_add_epi32(tmp1, ONE); H = gfmul_shl1(H); -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) i = 0; if (nbytes >= 16*8) { HT[0] = H; @@ -4533,95 +4726,33 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, pctr1[0] = ctr1; __asm__ __volatile__ ( - "vmovaps (%[pctr1]), %%xmm0\n\t" - "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" - "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" - "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" - "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" - "vpaddd %[TWO], %%xmm0, %[tmp3]\n\t" - "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" - "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" - "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" - "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" - "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" - "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" - "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" - "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" - "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" - "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" - "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" - "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" - - "vmovaps (%[KEY]), %%xmm1\n\t" - "vmovaps %%xmm0, (%[pctr1])\n\t" - "vpxor %%xmm1, %[tmp1], %[tmp1]\n\t" - "vpxor %%xmm1, %[tmp2], %[tmp2]\n\t" - "vpxor %%xmm1, %[tmp3], %[tmp3]\n\t" - "vpxor %%xmm1, %[tmp4], %[tmp4]\n\t" - "vpxor %%xmm1, %[tmp5], %[tmp5]\n\t" - "vpxor %%xmm1, %[tmp6], %[tmp6]\n\t" - "vpxor %%xmm1, %[tmp7], %[tmp7]\n\t" - "vpxor %%xmm1, %[tmp8], %[tmp8]\n\t" - - "vmovaps 16(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 32(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 48(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 64(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 80(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 96(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 112(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 128(%[KEY]), %%xmm12\n\t" - VAESENC() - "vmovaps 144(%[KEY]), %%xmm12\n\t" - VAESENC() + VAESENC_CTR() + VAESENC_XOR() + VAESENC_SET(16) + VAESENC_SET(32) + VAESENC_SET(48) + VAESENC_SET(64) + VAESENC_SET(80) + VAESENC_SET(96) + VAESENC_SET(112) + VAESENC_SET(128) + VAESENC_SET(144) "cmpl $11, %[nr]\n\t" "vmovaps 160(%[KEY]), %%xmm12\n\t" - "jl L_enc128_enclast\n\t" + "jl 1f\n\t" VAESENC() - "vmovaps 176(%[KEY]), %%xmm12\n\t" - VAESENC() + VAESENC_SET(176) "cmpl $13, %[nr]\n\t" "vmovaps 192(%[KEY]), %%xmm12\n\t" - "jl L_enc128_enclast\n\t" + "jl 1f\n\t" VAESENC() - "vmovaps 208(%[KEY]), %%xmm12\n\t" - VAESENC() + VAESENC_SET(208) "vmovaps 224(%[KEY]), %%xmm12\n\t" "\n" - "L_enc128_enclast:\n\t" - "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" - "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" - "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" - "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" - "vmovdqu %[tmp1], (%[out])\n\t" - "vmovdqu %[tmp2], 16(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" - "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" - "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" - "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" - "vmovdqu %[tmp3], 32(%[out])\n\t" - "vmovdqu %[tmp4], 48(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" - "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" - "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" - "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" - "vmovdqu %[tmp5], 64(%[out])\n\t" - "vmovdqu %[tmp6], 80(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" - "vaesenclast %%xmm12, %[tmp8], 
%[tmp8]\n\t" - "vpxor 96(%[in]), %[tmp7], %[tmp7]\n\t" - "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" - "vmovdqu %[tmp7], 96(%[out])\n\t" - "vmovdqu %[tmp8], 112(%[out])\n\t" + "1:\n\t" + VAESENC_LAST() : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), @@ -4640,126 +4771,34 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, XV = X; for (i=1; i < (int)(nbytes/16/8); i++) { __asm__ __volatile__ ( - "vmovaps (%[pctr1]), %%xmm0\n\t" - "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" - "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" - "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" - "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" - "vpaddd %[TWO], %%xmm0, %[tmp3]\n\t" - "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" - "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" - "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" - "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" - "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" - "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" - "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" - "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" - "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" - "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" - "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" - "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" + VAESENC_CTR() + VAESENC_XOR() + VAESENC_PCLMUL_1(%[out], 16, -128, 112) + VAESENC_PCLMUL_N(%[out], 32, -112, 96) + VAESENC_PCLMUL_N(%[out], 48, -96, 80) + VAESENC_PCLMUL_N(%[out], 64, -80, 64) + VAESENC_PCLMUL_N(%[out], 80, -64, 48) + VAESENC_PCLMUL_N(%[out], 96, -48, 32) + VAESENC_PCLMUL_N(%[out], 112, -32, 16) + VAESENC_PCLMUL_N(%[out], 128, -16, 0) + VAESENC_PCLMUL_L(144) - "vmovaps (%[KEY]), %%xmm1\n\t" - "vmovaps %%xmm0, (%[pctr1])\n\t" - "vpxor %%xmm1, %[tmp1], %[tmp1]\n\t" - "vpxor %%xmm1, %[tmp2], %[tmp2]\n\t" - "vpxor %%xmm1, %[tmp3], %[tmp3]\n\t" - "vpxor %%xmm1, %[tmp4], %[tmp4]\n\t" - "vpxor %%xmm1, %[tmp5], %[tmp5]\n\t" - "vpxor %%xmm1, %[tmp6], %[tmp6]\n\t" - "vpxor %%xmm1, %[tmp7], %[tmp7]\n\t" - "vpxor %%xmm1, %[tmp8], %[tmp8]\n\t" - - "vmovaps 16(%[KEY]), %%xmm12\n\t" - "vmovdqu -128(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" - "vmovaps 112(%[HT]), %%xmm0\n\t" - "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vpxor %[XV], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" - "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" - "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" - "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" - "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" - "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" - "vpslldq $8, %%xmm13, %%xmm2\n\t" - "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" - "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" - "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" - "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" - - VAESENC_PCLMUL_N( 32, -112, 96) - VAESENC_PCLMUL_N( 48, -96, 80) - VAESENC_PCLMUL_N( 64, -80, 64) - VAESENC_PCLMUL_N( 80, -64, 48) - VAESENC_PCLMUL_N( 96, -48, 32) - VAESENC_PCLMUL_N(112, -32, 16) - VAESENC_PCLMUL_N(128, -16, 0) - - "vmovaps 144(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" - "vmovdqa %[MOD2_128], %%xmm0\n\t" - "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" - "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" - "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" - "vpshufd $78, %%xmm2, %%xmm13\n\t" - "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" - "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" - "vaesenc 
%%xmm12, %[tmp5], %[tmp5]\n\t" - "vpshufd $78, %%xmm13, %%xmm13\n\t" - "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" - "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" - "vmovdqa %%xmm13, %%xmm2\n\t" - "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" - "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" "cmpl $11, %[nr]\n\t" "vmovaps 160(%[KEY]), %%xmm12\n\t" "jl %=f\n\t" VAESENC() - "vmovaps 176(%[KEY]), %%xmm12\n\t" - VAESENC() + VAESENC_SET(176) "cmpl $13, %[nr]\n\t" "vmovaps 192(%[KEY]), %%xmm12\n\t" "jl %=f\n\t" VAESENC() - "vmovaps 208(%[KEY]), %%xmm12\n\t" - VAESENC() + VAESENC_SET(208) "vmovaps 224(%[KEY]), %%xmm12\n\t" "%=:\n\t" - "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" - "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" - "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" - "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" - "vmovdqu %[tmp1], (%[out])\n\t" - "vmovdqu %[tmp2], 16(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" - "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" - "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" - "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" - "vmovdqu %[tmp3], 32(%[out])\n\t" - "vmovdqu %[tmp4], 48(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" - "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" - "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" - "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" - "vmovdqu %[tmp5], 64(%[out])\n\t" - "vmovdqu %[tmp6], 80(%[out])\n\t" - "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" - "vaesenclast %%xmm12, %[tmp8], %[tmp8]\n\t" - "vpxor 96(%[in]), %[tmp7], %[tmp7]\n\t" - "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" - "vmovdqu %[tmp7], 96(%[out])\n\t" - "vmovdqu %[tmp8], 112(%[out])\n\t" + VAESENC_LAST() : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), @@ -4794,36 +4833,8 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, } for (k = i*8; k < (int)(nbytes/16); k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" - "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" - "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %[tmp2]\n\t" - "jl %=f\n\t" - "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" - "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" - "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %[tmp2]\n\t" - "jl %=f\n\t" - "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" - "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vmovaps 224(%[KEY]), %[tmp2]\n\t" - "%=:\n\t" - "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" - "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" - "vmovdqu %[tmp1], (%[out])\n\t" - "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" + VAESENC_BLOCK() - "vpxor %[tmp1], %[X], %[X]\n\t" "# Carryless Multiply X by H (128 x 128)\n\t" "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" @@ -4861,35 +4872,7 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, #else for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" - "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 16(%[KEY]), 
%[tmp1], %[tmp1]\n\t" - "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" - "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %[tmp2]\n\t" - "jl %=f\n\t" - "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" - "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" - "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %[tmp2]\n\t" - "jl %=f\n\t" - "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" - "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" - "vmovaps 224(%[KEY]), %[tmp2]\n\t" - "%=:\n\t" - "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" - "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" - "vmovdqu %[tmp1], (%[out])\n\t" - "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" - "vpxor %[tmp1], %[X], %[X]\n\t" + VAESENC_BLOCK() : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) @@ -5013,7 +4996,348 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, T = _mm_xor_si128(X, T); _mm_storeu_si128((__m128i*)tag, T); } + +#ifdef HAVE_INTEL_AVX2 +static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr) +{ + int i, j ,k; + __m128i ctr1; + __m128i H, Y, T; + __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + __m128i HT[8]; + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); +#else + __m128i tmp1, tmp2; +#endif + + if (ibytes == 12) + aes_gcm_avx1_calc_iv_12(KEY, ivec, nr, &H, &Y, &T, &X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, &H, &Y, &T, &X); + + for (i=0; i < (int)(abytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw_avx2(X, H); + } + if (abytes%16) { + last_block = _mm_setzero_si128(); + for (j=0; j < (int)(abytes%16); j++) + ((unsigned char*)&last_block)[j] = addt[i*16+j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw_avx2(X, H); + } + + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); + +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted_avx2(H, H); + HT[2] = gfmul_shifted_avx2(H, HT[1]); + HT[3] = gfmul_shifted_avx2(HT[1], HT[1]); + HT[4] = gfmul_shifted_avx2(HT[1], HT[2]); + HT[5] = gfmul_shifted_avx2(HT[2], HT[2]); + HT[6] = gfmul_shifted_avx2(HT[2], HT[3]); + HT[7] = gfmul_shifted_avx2(HT[3], HT[3]); + + pctr1[0] = ctr1; + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_SET(16) + VAESENC_SET(32) + VAESENC_SET(48) + VAESENC_SET(64) + VAESENC_SET(80) + VAESENC_SET(96) + VAESENC_SET(112) + VAESENC_SET(128) + 
VAESENC_SET(144) + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + "\n" + "1:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8) + : [KEY] "r" (KEY), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + + XV = X; + for (i=1; i < (int)(nbytes/16/8); i++) { + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_PCLMUL_AVX2_1(%[out], 16, -128, 112) + VAESENC_PCLMUL_AVX2_2(%[out], 32, -112, 96) + VAESENC_PCLMUL_AVX2_N(%[out], 48, -96, 80) + VAESENC_PCLMUL_AVX2_N(%[out], 64, -80, 64) + VAESENC_PCLMUL_AVX2_N(%[out], 80, -64, 48) + VAESENC_PCLMUL_AVX2_N(%[out], 96, -48, 32) + VAESENC_PCLMUL_AVX2_N(%[out], 112, -32, 16) + VAESENC_PCLMUL_AVX2_N(%[out], 128, -16, 0) + VAESENC_PCLMUL_AVX2_L(144) + + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + + "%=:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8), + [XV] "+xr" (XV) + : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + } + X = XV; + ctr1 = pctr1[0]; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8_avx2(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + "# Carryless Multiply X by H (128 x 128)\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "# Reduce\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + 
"vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "# End Reduce\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "memory" + ); + } + for (; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" + "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovaps 224(%[KEY]), %[tmp2]\n\t" + "%=:\n\t" + "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" + "vmovdqu %[tmp1], (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" + "vpxor %[tmp1], %[X], %[X]\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + if (k > 0) { + X = gfmul_shifted_avx2(X, H); + } +#endif + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = 
_mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); +} #endif /* HAVE_INTEL_AVX2 */ +#endif /* HAVE_INTEL_AVX1 */ #ifdef HAVE_AES_DECRYPT /* Figure 10. AES-GCM – Decrypt With Single Block Ghash at a Time */ @@ -5036,122 +5360,10 @@ static int AES_GCM_decrypt(const unsigned char *in, unsigned char *out, __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; #endif - if (ibytes == 12) { - Y = _mm_setzero_si128(); - for (j=0; j < 12; j++) - ((unsigned char*)&Y)[j] = ivec[j]; - Y = _mm_insert_epi32(Y, 0x1000000, 3); - /* (Compute E[ZERO, KS] and E[Y0, KS] together */ - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp2 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - T = _mm_aesenclast_si128(tmp2, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - } - else { - if (ibytes % 16) { - i = ibytes / 16; - for (j=0; j < ibytes%16; j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; - } - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - 
tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - - Y = _mm_setzero_si128(); - for (i=0; i < ibytes/16; i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - if (ibytes % 16) { - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); - tmp1 = _mm_insert_epi64(tmp1, 0, 1); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ - tmp1 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - T = _mm_aesenclast_si128(tmp1, lastKey); - } + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, &H, &Y, &T, &X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, &H, &Y, &T, &X); for (i=0; i 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp2 = _mm_aesenc_si128(tmp2, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - T = _mm_aesenclast_si128(tmp2, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - } - else { - if (ibytes % 16) { - i = ibytes / 16; - for (j=0; j < ibytes%16; j++) - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; - } - tmp1 = _mm_xor_si128(X, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - H = _mm_aesenclast_si128(tmp1, lastKey); - H = _mm_shuffle_epi8(H, BSWAP_MASK); - - Y = _mm_setzero_si128(); - for (i=0; i < ibytes/16; i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = 
_mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - if (ibytes % 16) { - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - } - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); - tmp1 = _mm_insert_epi64(tmp1, 0, 1); - Y = _mm_xor_si128(Y, tmp1); - Y = gfmul_sw(Y, H); - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ - tmp1 = _mm_xor_si128(Y, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - T = _mm_aesenclast_si128(tmp1, lastKey); - } + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, &H, &Y, &T, &X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, &H, &Y, &T, &X); for (i=0; i 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted_avx2(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + return 0; /* in case the authentication failed */ + + return 1; /* when successful returns 1 */ +} #endif /* HAVE_INTEL_AVX2 */ +#endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AESNI */ @@ -6793,11 +6841,15 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESNI if (haveAESNI) { - #ifdef HAVE_INTEL_AVX2 + #ifdef HAVE_INTEL_AVX1 if (IS_INTEL_AVX2(intel_flags)) { AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); } + else if (IS_INTEL_AVX1(intel_flags)) { + AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + } else #endif AES_GCM_encrypt(in, out, authIn, iv, authTag, @@ -7055,11 +7107,16 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #ifdef WOLFSSL_AESNI if (haveAESNI) { - #ifdef HAVE_INTEL_AVX2 + #ifdef HAVE_INTEL_AVX1 if (IS_INTEL_AVX2(intel_flags)) { if (AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, ivSz, (byte*)aes->key, aes->rounds) == 0) - return AES_GCM_AUTH_E; + return AES_GCM_AUTH_E; + } + else if (IS_INTEL_AVX1(intel_flags)) { + if (AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, + ivSz, 
(byte*)aes->key, aes->rounds) == 0)
+                return AES_GCM_AUTH_E;
        }
        else
    #endif
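
Note on the decrypt dispatch above (illustrative commentary, not part of the patch): the AVX1/AVX2 decrypt routines verify the tag with a single 16-byte SIMD comparison and return 1 on success or 0 on authentication failure, which wc_AesGcmDecrypt then maps to AES_GCM_AUTH_E. A minimal sketch of that comparison follows, assuming a full 16-byte tag; the helper name gcm_tag_matches is hypothetical and does not appear in the patch.

#include <emmintrin.h> /* SSE2: _mm_loadu_si128, _mm_cmpeq_epi8, _mm_movemask_epi8 */

/* Hypothetical helper, not in the patch: returns 1 when the computed tag T
 * equals the caller-supplied 16-byte tag, 0 otherwise.  _mm_cmpeq_epi8 sets
 * each result byte to 0xff where the two inputs match, and _mm_movemask_epi8
 * gathers the top bit of every byte, so a mask of 0xffff means all 16 bytes
 * were equal. */
static int gcm_tag_matches(__m128i T, const unsigned char* tag)
{
    __m128i given = _mm_loadu_si128((const __m128i*)tag);
    return 0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(T, given));
}

This mirrors the check at the end of the AVX2 decrypt routine shown earlier in this section: a whole-block vector compare avoids an early-exit byte loop, so the time taken does not depend on where a mismatch occurs.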