diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 6fd55814f..bcebe14af 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -3537,6 +3537,241 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) static const __m128i MOD2_128 = M128_INIT(0x1, 0xc200000000000000UL); +static __m128i gfmul_sw(__m128i a, __m128i b) +{ + __m128i r, t1, t2, t3, t4, t5, t6, t7; + t2 = _mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + t5 = _mm_slli_epi32(t1, 31); + t6 = _mm_slli_epi32(t1, 30); + t7 = _mm_slli_epi32(t1, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + t1 = _mm_xor_si128(t1, t5); + t7 = _mm_srli_epi32(t1, 1); + t3 = _mm_srli_epi32(t1, 2); + t2 = _mm_srli_epi32(t1, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, t1); + r = _mm_xor_si128(t4, t7); + + return r; +} + + +static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1) +{ + __m128i t1, t2, t3, t4; + + /* 128 x 128 Carryless Multiply */ + t2 = _mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + *r0 = _mm_xor_si128(t1, *r0); + *r1 = _mm_xor_si128(t4, *r1); +} + +static __m128i gfmul_shl1(__m128i a) +{ + __m128i t1 = a, t2; + t2 = _mm_srli_epi64(t1, 63); + t1 = _mm_slli_epi64(t1, 1); + t2 = _mm_slli_si128(t2, 8); + t1 = _mm_or_si128(t1, t2); + /* if (a[1] >> 63) t1 = _mm_xor_si128(t1, MOD2_128); */ + a = _mm_shuffle_epi32(a, 0xff); + a = _mm_srai_epi32(a, 31); + a = _mm_and_si128(a, MOD2_128); + t1 = _mm_xor_si128(t1, a); + return t1; +} + +static __m128i ghash_red(__m128i r0, __m128i r1) +{ + __m128i t2, t3; + __m128i t5, t6, t7; + + t5 = _mm_slli_epi32(r0, 31); + t6 = _mm_slli_epi32(r0, 30); + t7 = _mm_slli_epi32(r0, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + r0 = _mm_xor_si128(r0, t5); + t7 = _mm_srli_epi32(r0, 1); + t3 = _mm_srli_epi32(r0, 2); + t2 = _mm_srli_epi32(r0, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, r0); + return _mm_xor_si128(r1, t7); +} + +static __m128i gfmul_shifted(__m128i a, __m128i b) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a, b, &t0, &t1); + return ghash_red(t0, t1); +} + +#ifndef AES_GCM_AESNI_NO_UNROLL +static __m128i gfmul8(__m128i a1, __m128i a2, __m128i a3, __m128i a4, + __m128i a5, __m128i a6, 
__m128i a7, __m128i a8, + __m128i b1, __m128i b2, __m128i b3, __m128i b4, + __m128i b5, __m128i b6, __m128i b7, __m128i b8) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a1, b8, &t0, &t1); + gfmul_only(a2, b7, &t0, &t1); + gfmul_only(a3, b6, &t0, &t1); + gfmul_only(a4, b5, &t0, &t1); + gfmul_only(a5, b4, &t0, &t1); + gfmul_only(a6, b3, &t0, &t1); + gfmul_only(a7, b2, &t0, &t1); + gfmul_only(a8, b1, &t0, &t1); + return ghash_red(t0, t1); +} +#endif + +#ifdef HAVE_INTEL_AVX2 +static __m128i gfmul_sw_avx2(__m128i a, __m128i b) +{ + __m128i r, t1, t2, t3, t4, t5, t6, t7; + /* 128 x 128 Carryless Multiply */ + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); + + /* shift left 1 bit - bits reversed */ + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + /* Reduction */ + t2 = _mm_clmulepi64_si128(t1, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t1, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + r = _mm_xor_si128(t4, t3); + + return r; +} + +static void gfmul_only_avx2(__m128i a, __m128i b, __m128i* r0, __m128i* r1) +{ + __m128i t1, t2, t3, t4; + + /* 128 x 128 Carryless Multiply */ + t3 = _mm_clmulepi64_si128(a, b, 0x10); + t2 = _mm_clmulepi64_si128(a, b, 0x01); + t1 = _mm_clmulepi64_si128(a, b, 0x00); + t4 = _mm_clmulepi64_si128(a, b, 0x11); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_slli_si128(t3, 8); + t3 = _mm_srli_si128(t3, 8); + t1 = _mm_xor_si128(t1, t2); + t4 = _mm_xor_si128(t4, t3); + *r0 = _mm_xor_si128(t1, *r0); + *r1 = _mm_xor_si128(t4, *r1); +} + +static __m128i ghash_red_avx2(__m128i r0, __m128i r1) +{ + __m128i t2, t3; + t2 = _mm_clmulepi64_si128(r0, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(r0, 78); + t3 = _mm_xor_si128(t3, t2); + t2 = _mm_clmulepi64_si128(t3, MOD2_128, 0x10); + t3 = _mm_shuffle_epi32(t3, 78); + t3 = _mm_xor_si128(t3, t2); + return _mm_xor_si128(r1, t3); +} + +static __m128i gfmul_shifted_avx2(__m128i a, __m128i b) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only_avx2(a, b, &t0, &t1); + return ghash_red_avx2(t0, t1); +} + +#ifndef AES_GCM_AESNI_NO_UNROLL +static __m128i gfmul8_avx2(__m128i a1, __m128i a2, __m128i a3, __m128i a4, + __m128i a5, __m128i a6, __m128i a7, __m128i a8, + __m128i b1, __m128i b2, __m128i b3, __m128i b4, + __m128i b5, __m128i b6, __m128i b7, __m128i b8) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only_avx2(a1, b8, &t0, &t1); + gfmul_only_avx2(a2, b7, &t0, &t1); + gfmul_only_avx2(a3, b6, &t0, &t1); + gfmul_only_avx2(a4, b5, &t0, &t1); + gfmul_only_avx2(a5, b4, &t0, &t1); + gfmul_only_avx2(a6, b3, &t0, &t1); + gfmul_only_avx2(a7, b2, &t0, &t1); + gfmul_only_avx2(a8, b1, &t0, &t1); + return ghash_red_avx2(t0, t1); +} +#endif /* AES_GCM_AESNI_NO_UNROLL */ +#endif /* HAVE_INTEL_AVX2 */ + /* See IntelĀ® Carry-Less Multiplication Instruction * and its Usage for Computing the GCM Mode White Paper @@ -3559,18 +3794,16 @@ static const __m128i EIGHT = 
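For reference: the hunk below extends aes_gcm_calc_iv_12() to take the caller's X block (still zero at this point, and encrypted to produce the hash key H) and to build the pre-counter block Y0 directly in an __m128i. For a 96-bit IV, SP 800-38D defines Y0 as IV || 0x00000001, which is exactly what _mm_insert_epi32(Y, 0x1000000, 3) produces: lane 3 holds the bytes 00 00 00 01. A byte-level sketch of the same construction, with a hypothetical helper name and nothing assumed beyond <string.h>:

#include <string.h>

/* Y0 for a 12-byte IV: the IV followed by a 32-bit big-endian counter of 1.
 * Illustration only; the patch builds this value in an XMM register instead. */
static void gcm_y0_from_12byte_iv(const unsigned char iv[12], unsigned char y0[16])
{
    memcpy(y0, iv, 12);   /* first 96 bits are the IV itself */
    y0[12] = 0x00;
    y0[13] = 0x00;
    y0[14] = 0x00;
    y0[15] = 0x01;        /* block counter starts at 1; bulk CTR encryption uses 2, 3, ... */
}

Encrypting Y0 under the AES key yields the value XORed into the final GHASH output to form the tag, which is why the macro computes E(K, 0) for H and E(K, Y0) for T in a single interleaved pass.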
M128_INIT(0x0, 0x8); static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); static const __m128i BSWAP_MASK = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); -#define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T) \ +#define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T, X) \ do \ { \ - word32 iv12[4]; \ - iv12[0] = *(word32*)&ivec[0]; \ - iv12[1] = *(word32*)&ivec[4]; \ - iv12[2] = *(word32*)&ivec[8]; \ - iv12[3] = 0x01000000; \ - Y = _mm_loadu_si128((__m128i*)iv12); \ + Y = _mm_setzero_si128(); \ + for (j=0; j < 12; j++) \ + ((unsigned char*)&Y)[j] = ivec[j]; \ + Y = _mm_insert_epi32(Y, 0x1000000, 3); \ \ /* (Compute E[ZERO, KS] and E[Y0, KS] together */ \ - tmp1 = _mm_load_si128(&KEY[0]); \ + tmp1 = _mm_xor_si128(X, KEY[0]); \ tmp2 = _mm_xor_si128(Y, KEY[0]); \ tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); \ @@ -3611,7 +3844,7 @@ do \ } \ while (0) -#define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T) \ +#define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X) \ do \ { \ if (ibytes % 16) { \ @@ -3619,7 +3852,7 @@ do \ for (j=0; j < (int)(ibytes%16); j++) \ ((unsigned char*)&last_block)[j] = ivec[i*16+j]; \ } \ - tmp1 = _mm_load_si128(&KEY[0]); \ + tmp1 = _mm_xor_si128(X, KEY[0]); \ tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ @@ -3721,2484 +3954,1137 @@ while (0) _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); - - -#define _VAR(a) ""#a"" -#define VAR(a) _VAR(a) - -#define HR %%xmm14 -#define XR %%xmm15 -#define KR %%ebx -#define KR64 %%rbx -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) -#define CTR1 128(%%rsp) -#define TR 144(%%rsp) -#define HTR %%rsp -#define STACK_OFFSET 160 -#else -#define CTR1 (%%rsp) -#define TR 16(%%rsp) -#define STACK_OFFSET 32 -#endif - -#define AESENC() \ - "aesenc %%xmm12, %%xmm4\n\t" \ - "aesenc %%xmm12, %%xmm5\n\t" \ - "aesenc %%xmm12, %%xmm6\n\t" \ - "aesenc %%xmm12, %%xmm7\n\t" \ - "aesenc %%xmm12, %%xmm8\n\t" \ - "aesenc %%xmm12, %%xmm9\n\t" \ - "aesenc %%xmm12, %%xmm10\n\t" \ - "aesenc %%xmm12, %%xmm11\n\t" - -#define AESENC_SET(o) \ - "movdqa "#o"(%[KEY]), %%xmm12\n\t" \ - AESENC() - -#define AESENC_CTR() \ - "movdqu "VAR(CTR1)", %%xmm4\n\t" \ - "movdqa %[BSWAP_EPI64], %%xmm1\n\t" \ - "movdqu %%xmm4, %%xmm0\n\t" \ - "pshufb %%xmm1, %%xmm4\n\t" \ - "movdqa %%xmm0, %%xmm5\n\t" \ - "paddd %[ONE], %%xmm5\n\t" \ - "pshufb %%xmm1, %%xmm5\n\t" \ - "movdqa %%xmm0, %%xmm6\n\t" \ - "paddd %[TWO], %%xmm6\n\t" \ - "pshufb %%xmm1, %%xmm6\n\t" \ - "movdqa %%xmm0, %%xmm7\n\t" \ - "paddd %[THREE], %%xmm7\n\t" \ - "pshufb %%xmm1, %%xmm7\n\t" \ - "movdqa %%xmm0, %%xmm8\n\t" \ - "paddd %[FOUR], %%xmm8\n\t" \ - "pshufb %%xmm1, %%xmm8\n\t" \ - "movdqa %%xmm0, %%xmm9\n\t" \ - "paddd %[FIVE], %%xmm9\n\t" \ - "pshufb %%xmm1, %%xmm9\n\t" \ - "movdqa %%xmm0, %%xmm10\n\t" \ - "paddd %[SIX], %%xmm10\n\t" \ - "pshufb %%xmm1, %%xmm10\n\t" \ - "movdqa %%xmm0, %%xmm11\n\t" \ - "paddd %[SEVEN], %%xmm11\n\t" \ - "pshufb %%xmm1, %%xmm11\n\t" \ - "paddd %[EIGHT], %%xmm0\n\t" - -#define AESENC_XOR() \ - "movdqa (%[KEY]), %%xmm12\n\t" \ - "movdqu %%xmm0, "VAR(CTR1)"\n\t" \ - "pxor %%xmm12, %%xmm4\n\t" \ - "pxor %%xmm12, %%xmm5\n\t" \ - "pxor %%xmm12, %%xmm6\n\t" \ - "pxor %%xmm12, %%xmm7\n\t" \ - "pxor %%xmm12, %%xmm8\n\t" \ - "pxor %%xmm12, %%xmm9\n\t" \ - "pxor %%xmm12, %%xmm10\n\t" \ - "pxor %%xmm12, %%xmm11\n\t" - -/* Encrypt and carry-less multiply for AVX1. 
*/ -#define AESENC_PCLMUL_1(src, o1, o2, o3) \ - "movdqu "#o3"("VAR(HTR)"), %%xmm12\n\t" \ - "movdqu "#o2"("#src"), %%xmm0\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm4\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm0\n\t" \ - "pxor %%xmm2, %%xmm0\n\t" \ - "pshufd $0x4e, %%xmm12, %%xmm1\n\t" \ - "pshufd $0x4e, %%xmm0, %%xmm14\n\t" \ - "pxor %%xmm12, %%xmm1\n\t" \ - "pxor %%xmm0, %%xmm14\n\t" \ - "movdqa %%xmm0, %%xmm3\n\t" \ - "pclmulqdq $0x11, %%xmm12, %%xmm3\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm5\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm6\n\t" \ - "movdqa %%xmm0, %%xmm2\n\t" \ - "pclmulqdq $0x00, %%xmm12, %%xmm2\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm7\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm8\n\t" \ - "pclmulqdq $0x00, %%xmm14, %%xmm1\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm9\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm10\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm11\n\t" \ - "pxor %%xmm2, %%xmm1\n\t" \ - "pxor %%xmm3, %%xmm1\n\t" \ - -#define AESENC_PCLMUL_N(src, o1, o2, o3) \ - "movdqu "#o3"("VAR(HTR)"), %%xmm12\n\t" \ - "movdqu "#o2"("#src"), %%xmm0\n\t" \ - "pshufd $0x4e, %%xmm12, %%xmm13\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm0\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm4\n\t" \ - "pxor %%xmm12, %%xmm13\n\t" \ - "pshufd $0x4e, %%xmm0, %%xmm14\n\t" \ - "pxor %%xmm0, %%xmm14\n\t" \ - "movdqa %%xmm0, %%xmm15\n\t" \ - "pclmulqdq $0x11, %%xmm12, %%xmm15\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm5\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm6\n\t" \ - "pclmulqdq $0x00, %%xmm0, %%xmm12\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm7\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm8\n\t" \ - "pclmulqdq $0x00, %%xmm14, %%xmm13\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm9\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm10\n\t" \ - "aesenc "#o1"(%[KEY]), %%xmm11\n\t" \ - "pxor %%xmm12, %%xmm1\n\t" \ - "pxor %%xmm12, %%xmm2\n\t" \ - "pxor %%xmm15, %%xmm1\n\t" \ - "pxor %%xmm15, %%xmm3\n\t" \ - "pxor %%xmm13, %%xmm1\n\t" \ - -#define AESENC_PCLMUL_L(o) \ - "movdqa %%xmm1, %%xmm14\n\t" \ - "psrldq $8, %%xmm1\n\t" \ - "pslldq $8, %%xmm14\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm4\n\t" \ - "pxor %%xmm14, %%xmm2\n\t" \ - "pxor %%xmm1, %%xmm3\n\t" \ - "movdqa %%xmm2, %%xmm12\n\t" \ - "movdqa %%xmm2, %%xmm13\n\t" \ - "movdqa %%xmm2, %%xmm14\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm5\n\t" \ - "pslld $31, %%xmm12\n\t" \ - "pslld $30, %%xmm13\n\t" \ - "pslld $25, %%xmm14\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm6\n\t" \ - "pxor %%xmm13, %%xmm12\n\t" \ - "pxor %%xmm14, %%xmm12\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm7\n\t" \ - "movdqa %%xmm12, %%xmm13\n\t" \ - "pslldq $12, %%xmm12\n\t" \ - "psrldq $4, %%xmm13\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm8\n\t" \ - "pxor %%xmm12, %%xmm2\n\t" \ - "movdqa %%xmm2, %%xmm14\n\t" \ - "movdqa %%xmm2, %%xmm1\n\t" \ - "movdqa %%xmm2, %%xmm0\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm9\n\t" \ - "psrld $1, %%xmm14\n\t" \ - "psrld $2, %%xmm1\n\t" \ - "psrld $7, %%xmm0\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm10\n\t" \ - "pxor %%xmm1, %%xmm14\n\t" \ - "pxor %%xmm0, %%xmm14\n\t" \ - "aesenc "#o"(%[KEY]), %%xmm11\n\t" \ - "pxor %%xmm13, %%xmm14\n\t" \ - "pxor %%xmm14, %%xmm2\n\t" \ - "pxor %%xmm3, %%xmm2\n\t" \ - -/* Encrypt and carry-less multiply with last key. 
*/ -#define AESENC_LAST(in, out) \ - "aesenclast %%xmm12, %%xmm4\n\t" \ - "aesenclast %%xmm12, %%xmm5\n\t" \ - "movdqu ("#in"),%%xmm0\n\t" \ - "movdqu 16("#in"),%%xmm1\n\t" \ - "pxor %%xmm0, %%xmm4\n\t" \ - "pxor %%xmm1, %%xmm5\n\t" \ - "movdqu %%xmm4, ("#out")\n\t" \ - "movdqu %%xmm5, 16("#out")\n\t" \ - "aesenclast %%xmm12, %%xmm6\n\t" \ - "aesenclast %%xmm12, %%xmm7\n\t" \ - "movdqu 32("#in"),%%xmm0\n\t" \ - "movdqu 48("#in"),%%xmm1\n\t" \ - "pxor %%xmm0, %%xmm6\n\t" \ - "pxor %%xmm1, %%xmm7\n\t" \ - "movdqu %%xmm6, 32("#out")\n\t" \ - "movdqu %%xmm7, 48("#out")\n\t" \ - "aesenclast %%xmm12, %%xmm8\n\t" \ - "aesenclast %%xmm12, %%xmm9\n\t" \ - "movdqu 64("#in"),%%xmm0\n\t" \ - "movdqu 80("#in"),%%xmm1\n\t" \ - "pxor %%xmm0, %%xmm8\n\t" \ - "pxor %%xmm1, %%xmm9\n\t" \ - "movdqu %%xmm8, 64("#out")\n\t" \ - "movdqu %%xmm9, 80("#out")\n\t" \ - "aesenclast %%xmm12, %%xmm10\n\t" \ - "aesenclast %%xmm12, %%xmm11\n\t" \ - "movdqu 96("#in"),%%xmm0\n\t" \ - "movdqu 112("#in"),%%xmm1\n\t" \ - "pxor %%xmm0, %%xmm10\n\t" \ - "pxor %%xmm1, %%xmm11\n\t" \ - "movdqu %%xmm10, 96("#out")\n\t" \ - "movdqu %%xmm11, 112("#out")\n\t" - -#define _AESENC_AVX(r) \ - "aesenc 16(%[KEY]), "#r"\n\t" \ - "aesenc 32(%[KEY]), "#r"\n\t" \ - "aesenc 48(%[KEY]), "#r"\n\t" \ - "aesenc 64(%[KEY]), "#r"\n\t" \ - "aesenc 80(%[KEY]), "#r"\n\t" \ - "aesenc 96(%[KEY]), "#r"\n\t" \ - "aesenc 112(%[KEY]), "#r"\n\t" \ - "aesenc 128(%[KEY]), "#r"\n\t" \ - "aesenc 144(%[KEY]), "#r"\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "movdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "aesenc %%xmm5, "#r"\n\t" \ - "aesenc 176(%[KEY]), "#r"\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "movdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "aesenc %%xmm5, "#r"\n\t" \ - "aesenc 208(%[KEY]), "#r"\n\t" \ - "movdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "aesenclast %%xmm5, "#r"\n\t" -#define AESENC_AVX(r) \ - _AESENC_AVX(r) - -#define AESENC_BLOCK(in, out) \ - "movdqu "VAR(CTR1)", %%xmm4\n\t" \ - "movdqu %%xmm4, %%xmm5\n\t" \ - "pshufb %[BSWAP_EPI64], %%xmm4\n\t" \ - "paddd %[ONE], %%xmm5\n\t" \ - "pxor (%[KEY]), %%xmm4\n\t" \ - "movdqu %%xmm5, "VAR(CTR1)"\n\t" \ - AESENC_AVX(%%xmm4) \ - "movdqu ("#in"), %%xmm5\n\t" \ - "pxor %%xmm5, %%xmm4\n\t" \ - "movdqu %%xmm4, ("#out")\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ - "pxor %%xmm4, "VAR(XR)"\n\t" - -#define _AESENC_GFMUL(in, out, H, X) \ - "movdqu "VAR(CTR1)", %%xmm4\n\t" \ - "movdqu %%xmm4, %%xmm5\n\t" \ - "pshufb %[BSWAP_EPI64], %%xmm4\n\t" \ - "paddd %[ONE], %%xmm5\n\t" \ - "pxor (%[KEY]), %%xmm4\n\t" \ - "movdqu %%xmm5, "VAR(CTR1)"\n\t" \ - "movdqa "#X", %%xmm6\n\t" \ - "pclmulqdq $0x10, "#H", %%xmm6\n\t" \ - "aesenc 16(%[KEY]), %%xmm4\n\t" \ - "aesenc 32(%[KEY]), %%xmm4\n\t" \ - "movdqa "#X", %%xmm7\n\t" \ - "pclmulqdq $0x01, "#H", %%xmm7\n\t" \ - "aesenc 48(%[KEY]), %%xmm4\n\t" \ - "aesenc 64(%[KEY]), %%xmm4\n\t" \ - "movdqa "#X", %%xmm8\n\t" \ - "pclmulqdq $0x00, "#H", %%xmm8\n\t" \ - "aesenc 80(%[KEY]), %%xmm4\n\t" \ - "movdqa "#X", %%xmm1\n\t" \ - "pclmulqdq $0x11, "#H", %%xmm1\n\t" \ - "aesenc 96(%[KEY]), %%xmm4\n\t" \ - "pxor %%xmm7, %%xmm6\n\t" \ - "movdqa %%xmm6, %%xmm2\n\t" \ - "psrldq $8, %%xmm6\n\t" \ - "pslldq $8, %%xmm2\n\t" \ - "aesenc 112(%[KEY]), %%xmm4\n\t" \ - "movdqa %%xmm1, %%xmm3\n\t" \ - "pxor %%xmm8, %%xmm2\n\t" \ - "pxor %%xmm6, %%xmm3\n\t" \ - "movdqa %[MOD2_128], %%xmm0\n\t" \ - "movdqa %%xmm2, %%xmm7\n\t" \ - "pclmulqdq $0x10, %%xmm0, %%xmm7\n\t" \ - "aesenc 128(%[KEY]), %%xmm4\n\t" \ - "pshufd $0x4e, %%xmm2, %%xmm6\n\t" \ - "pxor %%xmm7, %%xmm6\n\t" \ - "movdqa %%xmm6, 
%%xmm7\n\t" \ - "pclmulqdq $0x10, %%xmm0, %%xmm7\n\t" \ - "aesenc 144(%[KEY]), %%xmm4\n\t" \ - "pshufd $0x4e, %%xmm6, "VAR(XR)"\n\t" \ - "pxor %%xmm7, "VAR(XR)"\n\t" \ - "pxor %%xmm3, "VAR(XR)"\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "movdqu 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "aesenc %%xmm5, %%xmm4\n\t" \ - "aesenc 176(%[KEY]), %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "movdqu 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "aesenc %%xmm5, %%xmm4\n\t" \ - "aesenc 208(%[KEY]), %%xmm4\n\t" \ - "movdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "aesenclast %%xmm5, %%xmm4\n\t" \ - "movdqu ("#in"), %%xmm5\n\t" \ - "pxor %%xmm5, %%xmm4\n\t" \ - "movdqu %%xmm4, ("#out")\n\t" -#define AESENC_GFMUL(in, out, H, X) \ - _AESENC_GFMUL(in, out, H, X) - -#define _GHASH_GFMUL_AVX(r, r2, a, b) \ - "pshufd $0x4e, "#a", %%xmm1\n\t" \ - "pshufd $0x4e, "#b", %%xmm2\n\t" \ - "movdqa "#b", %%xmm3\n\t" \ - "movdqa "#b", %%xmm0\n\t" \ - "pclmulqdq $0x11, "#a", %%xmm3\n\t" \ - "pclmulqdq $0x00, "#a", %%xmm0\n\t" \ - "pxor "#a", %%xmm1\n\t" \ - "pxor "#b", %%xmm2\n\t" \ - "pclmulqdq $0x00, %%xmm2, %%xmm1\n\t" \ - "pxor %%xmm0, %%xmm1\n\t" \ - "pxor %%xmm3, %%xmm1\n\t" \ - "movdqa %%xmm1, %%xmm2\n\t" \ - "movdqa %%xmm0, "#r2"\n\t" \ - "movdqa %%xmm3, "#r"\n\t" \ - "pslldq $8, %%xmm2\n\t" \ - "psrldq $8, %%xmm1\n\t" \ - "pxor %%xmm2, "#r2"\n\t" \ - "pxor %%xmm1, "#r"\n\t" -#define GHASH_GFMUL_AVX(r, r2, a, b) \ - _GHASH_GFMUL_AVX(r, r2, a, b) - -#define _GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ - "pshufd $0x4e, "#a", %%xmm1\n\t" \ - "pshufd $0x4e, "#b", %%xmm2\n\t" \ - "movdqa "#b", %%xmm3\n\t" \ - "movdqa "#b", %%xmm0\n\t" \ - "pclmulqdq $0x11, "#a", %%xmm3\n\t" \ - "pclmulqdq $0x00, "#a", %%xmm0\n\t" \ - "pxor "#a", %%xmm1\n\t" \ - "pxor "#b", %%xmm2\n\t" \ - "pclmulqdq $0x00, %%xmm2, %%xmm1\n\t" \ - "pxor %%xmm0, %%xmm1\n\t" \ - "pxor %%xmm3, %%xmm1\n\t" \ - "movdqa %%xmm1, %%xmm2\n\t" \ - "pxor %%xmm0, "#r2"\n\t" \ - "pxor %%xmm3, "#r"\n\t" \ - "pslldq $8, %%xmm2\n\t" \ - "psrldq $8, %%xmm1\n\t" \ - "pxor %%xmm2, "#r2"\n\t" \ - "pxor %%xmm1, "#r"\n\t" -#define GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ - _GHASH_GFMUL_XOR_AVX(r, r2, a, b) - -#define GHASH_MID_AVX(r, r2) \ - "movdqa "#r2", %%xmm0\n\t" \ - "movdqa "#r", %%xmm1\n\t" \ - "psrld $31, %%xmm0\n\t" \ - "psrld $31, %%xmm1\n\t" \ - "pslld $1, "#r2"\n\t" \ - "pslld $1, "#r"\n\t" \ - "movdqa %%xmm0, %%xmm2\n\t" \ - "pslldq $4, %%xmm0\n\t" \ - "psrldq $12, %%xmm2\n\t" \ - "pslldq $4, %%xmm1\n\t" \ - "por %%xmm2, "#r"\n\t" \ - "por %%xmm0, "#r2"\n\t" \ - "por %%xmm1, "#r"\n\t" - -#define _GHASH_GFMUL_RED_AVX(r, a, b) \ - "pshufd $0x4e, "#a", %%xmm5\n\t" \ - "pshufd $0x4e, "#b", %%xmm6\n\t" \ - "movdqa "#b", %%xmm7\n\t" \ - "movdqa "#b", %%xmm4\n\t" \ - "pclmulqdq $0x11, "#a", %%xmm7\n\t" \ - "pclmulqdq $0x00, "#a", %%xmm4\n\t" \ - "pxor "#a", %%xmm5\n\t" \ - "pxor "#b", %%xmm6\n\t" \ - "pclmulqdq $0x00, %%xmm6, %%xmm5\n\t" \ - "pxor %%xmm4, %%xmm5\n\t" \ - "pxor %%xmm7, %%xmm5\n\t" \ - "movdqa %%xmm5, %%xmm6\n\t" \ - "movdqa %%xmm7, "#r"\n\t" \ - "pslldq $8, %%xmm6\n\t" \ - "psrldq $8, %%xmm5\n\t" \ - "pxor %%xmm6, %%xmm4\n\t" \ - "pxor %%xmm5, "#r"\n\t" \ - "movdqa %%xmm4, %%xmm8\n\t" \ - "movdqa %%xmm4, %%xmm9\n\t" \ - "movdqa %%xmm4, %%xmm10\n\t" \ - "pslld $31, %%xmm8\n\t" \ - "pslld $30, %%xmm9\n\t" \ - "pslld $25, %%xmm10\n\t" \ - "pxor %%xmm9, %%xmm8\n\t" \ - "pxor %%xmm10, %%xmm8\n\t" \ - "movdqa %%xmm8, %%xmm9\n\t" \ - "psrldq $4, %%xmm9\n\t" \ - "pslldq $12, %%xmm8\n\t" \ - "pxor %%xmm8, %%xmm4\n\t" \ - "movdqa %%xmm4, %%xmm10\n\t" \ - "movdqa %%xmm4, 
%%xmm6\n\t" \ - "movdqa %%xmm4, %%xmm5\n\t" \ - "psrld $1, %%xmm10\n\t" \ - "psrld $2, %%xmm6\n\t" \ - "psrld $7, %%xmm5\n\t" \ - "pxor %%xmm6, %%xmm10\n\t" \ - "pxor %%xmm5, %%xmm10\n\t" \ - "pxor %%xmm9, %%xmm10\n\t" \ - "pxor %%xmm4, %%xmm10\n\t" \ - "pxor %%xmm10, "#r"\n\t" -#define GHASH_GFMUL_RED_AVX(r, a, b) \ - _GHASH_GFMUL_RED_AVX(r, a, b) - -#define GHASH_RED_AVX(r, r2) \ - "movdqa "#r2", %%xmm0\n\t" \ - "movdqa "#r2", %%xmm1\n\t" \ - "movdqa "#r2", %%xmm2\n\t" \ - "pslld $31, %%xmm0\n\t" \ - "pslld $30, %%xmm1\n\t" \ - "pslld $25, %%xmm2\n\t" \ - "pxor %%xmm1, %%xmm0\n\t" \ - "pxor %%xmm2, %%xmm0\n\t" \ - "movdqa %%xmm0, %%xmm1\n\t" \ - "psrldq $4, %%xmm1\n\t" \ - "pslldq $12, %%xmm0\n\t" \ - "pxor %%xmm0, "#r2"\n\t" \ - "movdqa "#r2", %%xmm2\n\t" \ - "movdqa "#r2", %%xmm3\n\t" \ - "movdqa "#r2", %%xmm0\n\t" \ - "psrld $1, %%xmm2\n\t" \ - "psrld $2, %%xmm3\n\t" \ - "psrld $7, %%xmm0\n\t" \ - "pxor %%xmm3, %%xmm2\n\t" \ - "pxor %%xmm0, %%xmm2\n\t" \ - "pxor %%xmm1, %%xmm2\n\t" \ - "pxor "#r2", %%xmm2\n\t" \ - "pxor %%xmm2, "#r"\n\t" - -#define GHASH_GFMUL_RED_XOR_AVX(r, r2, a, b) \ - GHASH_GFMUL_XOR_AVX(r, r2, a, b) \ - GHASH_RED_AVX(r, r2) - -#define GHASH_FULL_AVX(r, r2, a, b) \ - GHASH_GFMUL_AVX(r, r2, a, b) \ - GHASH_MID_AVX(r, r2) \ - GHASH_RED_AVX(r, r2) - -#define CALC_IV_12() \ - "# Calculate values when IV is 12 bytes\n\t" \ - "# Set counter based on IV\n\t" \ - "movl $0x01000000, %%ecx\n\t" \ - "pinsrq $0, 0(%%rax), %%xmm13\n\t" \ - "pinsrd $2, 8(%%rax), %%xmm13\n\t" \ - "pinsrd $3, %%ecx, %%xmm13\n\t" \ - "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \ - "movdqu %%xmm13, %%xmm1\n\t" \ - "movdqa 0(%[KEY]), "VAR(HR)"\n\t" \ - "pxor "VAR(HR)", %%xmm1\n\t" \ - "movdqa 16(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 32(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 48(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 64(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 80(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 96(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 112(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 128(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 144(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "movdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 31f\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqa 176(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "movdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 31f\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqu 208(%[KEY]), %%xmm12\n\t" \ - "aesenc %%xmm12, "VAR(HR)"\n\t" \ - "aesenc %%xmm12, %%xmm1\n\t" \ - "movdqu 224(%[KEY]), %%xmm12\n\t" \ - "31:\n\t" \ - "aesenclast %%xmm12, "VAR(HR)"\n\t" \ - "aesenclast %%xmm12, %%xmm1\n\t" \ - "pshufb %[BSWAP_MASK], "VAR(HR)"\n\t" \ - "movdqu %%xmm1, "VAR(TR)"\n\t" \ - "jmp 39f\n\t" - -#define CALC_IV() \ - "# Calculate values when IV is not 12 bytes\n\t" \ - "# H = Encrypt X(=0)\n\t" \ - "movdqa 0(%[KEY]), "VAR(HR)"\n\t" \ - 
AESENC_AVX(HR) \ - "pshufb %[BSWAP_MASK], "VAR(HR)"\n\t" \ - "# Calc counter\n\t" \ - "# Initialization vector\n\t" \ - "cmpl $0, %%edx\n\t" \ - "movq $0, %%rcx\n\t" \ - "je 45f\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 44f\n\t" \ - "andl $0xfffffff0, %%edx\n\t" \ - "\n" \ - "43:\n\t" \ - "movdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ - "pxor %%xmm4, %%xmm13\n\t" \ - GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 43b\n\t" \ - "movl %[ibytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 45f\n\t" \ - "\n" \ - "44:\n\t" \ - "subq $16, %%rsp\n\t" \ - "pxor %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "movdqu %%xmm4, (%%rsp)\n\t" \ - "42:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 42b\n\t" \ - "movdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ - "pxor %%xmm4, %%xmm13\n\t" \ - GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \ - "\n" \ - "45:\n\t" \ - "# T = Encrypt counter\n\t" \ - "pxor %%xmm0, %%xmm0\n\t" \ - "shll $3, %%edx\n\t" \ - "pinsrq $0, %%rdx, %%xmm0\n\t" \ - "pxor %%xmm0, %%xmm13\n\t" \ - GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR) \ - "pshufb %[BSWAP_MASK], %%xmm13\n\t" \ - "# Encrypt counter\n\t" \ - "movdqa 0(%[KEY]), %%xmm4\n\t" \ - "pxor %%xmm13, %%xmm4\n\t" \ - AESENC_AVX(%%xmm4) \ - "movdqu %%xmm4, "VAR(TR)"\n\t" - -#define CALC_AAD() \ - "# Additional authentication data\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl $0, %%edx\n\t" \ - "je 25f\n\t" \ - "movq %[addt], %%rax\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 24f\n\t" \ - "andl $0xfffffff0, %%edx\n\t" \ - "\n" \ - "23:\n\t" \ - "movdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ - "pxor %%xmm4, "VAR(XR)"\n\t" \ - GHASH_FULL_AVX(XR, %%xmm12, XR, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 23b\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 25f\n\t" \ - "\n" \ - "24:\n\t" \ - "subq $16, %%rsp\n\t" \ - "pxor %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "movdqu %%xmm4, (%%rsp)\n\t" \ - "22:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 22b\n\t" \ - "movdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm4\n\t" \ - "pxor %%xmm4, "VAR(XR)"\n\t" \ - GHASH_FULL_AVX(XR, %%xmm12, XR, HR) \ - "\n" \ - "25:\n\t" - -#define CALC_HT_8_AVX() \ - "movdqa "VAR(XR)", %%xmm2\n\t" \ - "# H ^ 1\n\t" \ - "movdqu "VAR(HR)", 0("VAR(HTR)")\n\t" \ - "# H ^ 2\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm0, HR, HR) \ - "movdqu %%xmm0 , 16("VAR(HTR)")\n\t" \ - "# H ^ 3\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm1, HR, %%xmm0) \ - "movdqu %%xmm1 , 32("VAR(HTR)")\n\t" \ - "# H ^ 4\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm3, %%xmm0, %%xmm0) \ - "movdqu %%xmm3 , 48("VAR(HTR)")\n\t" \ - "# H ^ 5\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm0, %%xmm1) \ - "movdqu %%xmm12, 64("VAR(HTR)")\n\t" \ - "# H ^ 6\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm1) \ - "movdqu %%xmm12, 80("VAR(HTR)")\n\t" \ - "# H ^ 7\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm3) \ - "movdqu %%xmm12, 96("VAR(HTR)")\n\t" \ - "# H ^ 8\n\t" \ - GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm3, %%xmm3) \ - "movdqu %%xmm12, 112("VAR(HTR)")\n\t" - -#define AESENC_128_GHASH_AVX(src, o) \ - "leaq 
(%[in],"VAR(KR64)",1), %%rcx\n\t" \ - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" \ - /* src is either %%rcx or %%rdx */ \ - AESENC_CTR() \ - AESENC_XOR() \ - AESENC_PCLMUL_1(src, 16, o-128, 112) \ - AESENC_PCLMUL_N(src, 32, o-112, 96) \ - AESENC_PCLMUL_N(src, 48, o -96, 80) \ - AESENC_PCLMUL_N(src, 64, o -80, 64) \ - AESENC_PCLMUL_N(src, 80, o -64, 48) \ - AESENC_PCLMUL_N(src, 96, o -48, 32) \ - AESENC_PCLMUL_N(src, 112, o -32, 16) \ - AESENC_PCLMUL_N(src, 128, o -16, 0) \ - AESENC_PCLMUL_L(144) \ - "cmpl $11, %[nr]\n\t" \ - "movdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - AESENC() \ - AESENC_SET(176) \ - "cmpl $13, %[nr]\n\t" \ - "movdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - AESENC() \ - AESENC_SET(208) \ - "movdqa 224(%[KEY]), %%xmm12\n\t" \ - "\n" \ -"4:\n\t" \ - AESENC_LAST(%%rcx, %%rdx) - -#define AESENC_LAST15_ENC_AVX() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "movdqu "VAR(CTR1)", %%xmm13\n\t" \ - "pshufb %[BSWAP_EPI64], %%xmm13\n\t" \ - "pxor 0(%[KEY]), %%xmm13\n\t" \ - AESENC_AVX(%%xmm13) \ - "subq $16, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "movdqu %%xmm13, (%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "xorq %%r13, %%r13\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "je 53f\n\t" \ - "\n" \ - "52:\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl %%ecx\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "jl 52b\n\t" \ - "53:\n\t" \ - "movdqu (%%rsp), %%xmm13\n\t" \ - "addq $16, %%rsp\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm13\n\t" \ - "pxor %%xmm13, "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX(XR, HR, XR) \ - -#define AESENC_LAST15_DEC_AVX() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "movdqu "VAR(CTR1)", %%xmm13\n\t" \ - "pshufb %[BSWAP_EPI64], %%xmm13\n\t" \ - "pxor 0(%[KEY]), %%xmm13\n\t" \ - AESENC_AVX(%%xmm13) \ - "subq $32, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "movdqu %%xmm13, (%%rsp)\n\t" \ - "pxor %%xmm0, %%xmm0\n\t" \ - "movdqu %%xmm0, 16(%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \ - "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "53:\n\t" \ - "movdqu 16(%%rsp), %%xmm13\n\t" \ - "addq $32, %%rsp\n\t" \ - "pshufb %[BSWAP_MASK], %%xmm13\n\t" \ - "pxor %%xmm13, "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX(XR, HR, XR) \ - -#define CALC_TAG() \ - "movl %[nbytes], %%edx\n\t" \ - "movl %[abytes], %%ecx\n\t" \ - "shlq $3, %%rdx\n\t" \ - "shlq $3, %%rcx\n\t" \ - "pinsrq $0, %%rdx, %%xmm0\n\t" \ - "pinsrq $1, %%rcx, %%xmm0\n\t" \ - "pxor %%xmm0, "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX(XR, HR, XR) \ - "pshufb %[BSWAP_MASK], "VAR(XR)"\n\t" \ - "movdqu "VAR(TR)", %%xmm0\n\t" \ - "pxor "VAR(XR)", %%xmm0\n\t" \ - - -static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, +void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr); +void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, const 
unsigned char* addt, const unsigned char* ivec, unsigned char *tag, unsigned int nbytes, unsigned int abytes, unsigned int ibytes, const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "pxor %%xmm13, %%xmm13\n\t" - "pxor "VAR(XR)", "VAR(XR)"\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12() - "\n" - "35:\n\t" - CALC_IV() - "\n" - "39:\n\t" - - CALC_AAD() - - "# Calculate counter and H\n\t" - "pshufb %[BSWAP_EPI64], %%xmm13\n\t" - "movdqa "VAR(HR)", %%xmm5\n\t" - "paddd %[ONE], %%xmm13\n\t" - "movdqa "VAR(HR)", %%xmm4\n\t" - "movdqu %%xmm13, "VAR(CTR1)"\n\t" - "psrlq $63, %%xmm5\n\t" - "psllq $1, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "por %%xmm5, %%xmm4\n\t" - "pshufd $0xff, "VAR(HR)", "VAR(HR)"\n\t" - "psrad $31, "VAR(HR)"\n\t" - "pand %[MOD2_128], "VAR(HR)"\n\t" - "pxor %%xmm4, "VAR(HR)"\n\t" - - "xorl "VAR(KR)", "VAR(KR)"\n\t" - -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - "cmpl $128, %[nbytes]\n\t" - "movl %[nbytes], %%r13d\n\t" - "jl 5f\n\t" - "andl $0xffffff80, %%r13d\n\t" - - CALC_HT_8_AVX() - - "# First 128 bytes of input\n\t" - AESENC_CTR() - AESENC_XOR() - AESENC_SET(16) - AESENC_SET(32) - AESENC_SET(48) - AESENC_SET(64) - AESENC_SET(80) - AESENC_SET(96) - AESENC_SET(112) - AESENC_SET(128) - AESENC_SET(144) - "cmpl $11, %[nr]\n\t" - "movdqa 160(%[KEY]), %%xmm12\n\t" - "jl 1f\n\t" - AESENC() - AESENC_SET(176) - "cmpl $13, %[nr]\n\t" - "movdqa 192(%[KEY]), %%xmm12\n\t" - "jl 1f\n\t" - AESENC() - AESENC_SET(208) - "movdqa 224(%[KEY]), %%xmm12\n\t" - "\n" - "1:\n\t" - AESENC_LAST(%[in], %[out]) - - "cmpl $128, %%r13d\n\t" - "movl $128, "VAR(KR)"\n\t" - "jle 2f\n\t" - - "# More 128 bytes of input\n\t" - "\n" - "3:\n\t" - AESENC_128_GHASH_AVX(%%rdx, 0) - "addl $128, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 3b\n\t" - "\n" - "2:\n\t" - "movdqa %[BSWAP_MASK], %%xmm13\n\t" - "pshufb %%xmm13, %%xmm4\n\t" - "pshufb %%xmm13, %%xmm5\n\t" - "pshufb %%xmm13, %%xmm6\n\t" - "pshufb %%xmm13, %%xmm7\n\t" - "pxor %%xmm2, %%xmm4\n\t" - "pshufb %%xmm13, %%xmm8\n\t" - "pshufb %%xmm13, %%xmm9\n\t" - "pshufb %%xmm13, %%xmm10\n\t" - "pshufb %%xmm13, %%xmm11\n\t" - - "movdqu 112("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_AVX(XR, %%xmm13, %%xmm4, %%xmm12) - "movdqu 96("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm5, %%xmm12) - "movdqu 80("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm6, %%xmm12) - "movdqu 64("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm7, %%xmm12) - "movdqu 48("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm8, %%xmm12) - "movdqu 32("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm9, %%xmm12) - "movdqu 16("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm10, %%xmm12) - "movdqu ("VAR(HTR)"), %%xmm12\n\t" - GHASH_GFMUL_RED_XOR_AVX(XR, %%xmm13, %%xmm11, %%xmm12) - - "movdqu 0("VAR(HTR)"), "VAR(HR)"\n\t" - "\n" - "5:\n\t" - "movl %[nbytes], %%edx\n\t" - "cmpl %%edx, "VAR(KR)"\n\t" - "jge 55f\n\t" + int i, j ,k; + __m128i ctr1; + __m128i H, Y, T; + __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); + __m128i tmp1, tmp2; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i XV; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; #endif - "movl %[nbytes], %%r13d\n\t" - "andl $0xfffffff0, %%r13d\n\t" 
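The C body of AES_GCM_encrypt that follows hashes the AAD and ciphertext with the gfmul_sw()/gfmul_shifted() helpers from the first hunk. As a sanity check, those CLMUL routines can be compared against the plain bit-by-bit GF(2^128) multiply from NIST SP 800-38D. A minimal reference sketch (hypothetical function name, illustration only, neither constant-time nor fast) that works on blocks in GHASH wire order, i.e. before the BSWAP_MASK byte reversal applied by the intrinsics code:

#include <string.h>

/* Z = X * Y in GF(2^128) with the GHASH bit ordering (bit 0 = MSB of byte 0).
 * Reduction polynomial: x^128 + x^7 + x^2 + x + 1, i.e. R = 0xe1 || 0^120. */
static void ghash_gfmul_ref(const unsigned char X[16],
                            const unsigned char Y[16], unsigned char Z[16])
{
    unsigned char V[16];
    int i, j, k, lsb;

    memset(Z, 0, 16);
    memcpy(V, Y, 16);
    for (i = 0; i < 16; i++) {
        for (j = 7; j >= 0; j--) {
            if ((X[i] >> j) & 1) {
                for (k = 0; k < 16; k++)
                    Z[k] ^= V[k];            /* Z ^= V when this bit of X is set */
            }
            lsb = V[15] & 1;                 /* bit that falls off on the shift */
            for (k = 15; k > 0; k--)         /* V = V * x: shift right one bit */
                V[k] = (unsigned char)((V[k] >> 1) | ((V[k - 1] & 1) << 7));
            V[0] >>= 1;
            if (lsb)
                V[0] ^= 0xe1;                /* fold the reduction polynomial back in */
        }
    }
}

With byte-reversed inputs and output this should agree with gfmul_sw(). gfmul_shifted() omits the internal shift-left-by-one and instead expects its second operand pre-multiplied via gfmul_shl1(), which is why the new code computes H = gfmul_shl1(H) once before entering the bulk loops.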
- "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 14f\n\t" + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - "leaq (%[in],"VAR(KR64)",1), %%rcx\n\t" - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" - AESENC_BLOCK(%%rcx, %%rdx) - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 13f\n\t" - "\n" - "12:\n\t" - "leaq (%[in],"VAR(KR64)",1), %%rcx\n\t" - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" - AESENC_GFMUL(%%rcx, %%rdx, HR, XR) - "pshufb %[BSWAP_MASK], %%xmm4\n\t" - "pxor %%xmm4, "VAR(XR)"\n\t" - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 12b\n\t" - "\n" - "13:\n\t" - GHASH_GFMUL_RED_AVX(XR, HR, XR) - "\n" - "14:\n\t" + for (i=0; i < (int)(abytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } + if (abytes%16) { + last_block = _mm_setzero_si128(); + for (j=0; j < (int)(abytes%16); j++) + ((unsigned char*)&last_block)[j] = addt[i*16+j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } - AESENC_LAST15_ENC_AVX() - "\n" - "55:\n\t" + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); - CALC_TAG() - "movdqu %%xmm0, (%[tag])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" +#ifndef AES_GCM_AESNI_NO_UNROLL + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ibytes), - [tag] "r" (tag), - [BSWAP_MASK] "m" (BSWAP_MASK), - [BSWAP_EPI64] "m" (BSWAP_EPI64), - [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + AES_ENC_8(1); + AES_ENC_8(2); + AES_ENC_8(3); + AES_ENC_8(4); + AES_ENC_8(5); + AES_ENC_8(6); + AES_ENC_8(7); + AES_ENC_8(8); + AES_ENC_8(9); + lastKey = KEY[10]; + if (nr > 10) { + AES_ENC_8(10); + AES_ENC_8(11); + lastKey = KEY[12]; + if (nr > 12) { + AES_ENC_8(12); + AES_ENC_8(13); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + + for 
(i=1; i < (int)(nbytes/16/8); i++) { + r0 = _mm_setzero_si128(); + r1 = _mm_setzero_si128(); + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+0]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + gfmul_only(XV, HT[7], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+1]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[6], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+2]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[5], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+3]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[4], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+4]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[3], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); + /* 128 x 
128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+5]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[2], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+6]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[1], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+7]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[0], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); + /* Reduction */ + X = ghash_red(r0, r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + } + + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = 
_mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + for (; k < (int)(nbytes/16); k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + X = gfmul_shifted(X, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + if (k > 0) { + X = gfmul_shifted(X, H); + } #endif - [MOD2_128] "m" 
(MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "rbx", "rcx", "rdx", "r13" - ); + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); } #ifdef HAVE_INTEL_AVX1 /* Encrypt with key in xmm12. */ -#define VAESENC() \ - "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" \ - "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" \ - "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" \ - "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" \ - "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" \ - "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" \ - "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" \ - "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" +#define VAESENC() \ + "vaesenc %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vaesenc %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vaesenc %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vaesenc %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm12, %[tmp8], %[tmp8]\n\t" -#define VAESENC_SET(o) \ - "vmovdqa "#o"(%[KEY]), %%xmm12\n\t" \ +#define VAESENC_SET(o) \ + "vmovaps "#o"(%[KEY]), %%xmm12\n\t" \ VAESENC() #define VAESENC_CTR() \ - "vmovdqu "VAR(CTR1)", %%xmm0\n\t" \ - "vmovdqa %[BSWAP_EPI64], %%xmm1\n\t" \ - "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t" \ - "vpaddd %[ONE], %%xmm0, %%xmm5\n\t" \ - "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t" \ - "vpaddd %[TWO], %%xmm0, %%xmm6\n\t" \ - "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t" \ - "vpaddd %[THREE], %%xmm0, %%xmm7\n\t" \ - "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t" \ - "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t" \ - "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t" \ - "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t" \ - "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t" \ - "vpaddd %[SIX], %%xmm0, %%xmm10\n\t" \ - "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t" \ - "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t" \ - "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t" \ + "vmovaps (%[pctr1]), %%xmm0\n\t" \ + "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" \ + "vpshufb %%xmm1, %%xmm0, %[tmp1]\n\t" \ + "vpaddd %[ONE], %%xmm0, %[tmp2]\n\t" \ + "vpshufb %%xmm1, %[tmp2], %[tmp2]\n\t" \ + "vpaddd %[TWO], %%xmm0, 
%[tmp3]\n\t" \ + "vpshufb %%xmm1, %[tmp3], %[tmp3]\n\t" \ + "vpaddd %[THREE], %%xmm0, %[tmp4]\n\t" \ + "vpshufb %%xmm1, %[tmp4], %[tmp4]\n\t" \ + "vpaddd %[FOUR], %%xmm0, %[tmp5]\n\t" \ + "vpshufb %%xmm1, %[tmp5], %[tmp5]\n\t" \ + "vpaddd %[FIVE], %%xmm0, %[tmp6]\n\t" \ + "vpshufb %%xmm1, %[tmp6], %[tmp6]\n\t" \ + "vpaddd %[SIX], %%xmm0, %[tmp7]\n\t" \ + "vpshufb %%xmm1, %[tmp7], %[tmp7]\n\t" \ + "vpaddd %[SEVEN], %%xmm0, %[tmp8]\n\t" \ + "vpshufb %%xmm1, %[tmp8], %[tmp8]\n\t" \ "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" #define VAESENC_XOR() \ - "vmovdqa (%[KEY]), %%xmm12\n\t" \ - "vmovdqu %%xmm0, "VAR(CTR1)"\n\t" \ - "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm12, %%xmm5, %%xmm5\n\t" \ - "vpxor %%xmm12, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm12, %%xmm7, %%xmm7\n\t" \ - "vpxor %%xmm12, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm12, %%xmm9, %%xmm9\n\t" \ - "vpxor %%xmm12, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm12, %%xmm11, %%xmm11\n\t" + "vmovaps (%[KEY]), %%xmm12\n\t" \ + "vmovaps %%xmm0, (%[pctr1])\n\t" \ + "vpxor %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vpxor %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vpxor %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vpxor %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vpxor %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vpxor %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vpxor %%xmm12, %[tmp8], %[tmp8]\n\t" -#define VAESENC_128() \ - VAESENC_CTR() \ - VAESENC_XOR() \ - VAESENC_SET(16) \ - VAESENC_SET(32) \ - VAESENC_SET(48) \ - VAESENC_SET(64) \ - VAESENC_SET(80) \ - VAESENC_SET(96) \ - VAESENC_SET(112) \ - VAESENC_SET(128) \ - VAESENC_SET(144) \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 1f\n\t" \ - VAESENC() \ - VAESENC_SET(176) \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 1f\n\t" \ - VAESENC() \ - VAESENC_SET(208) \ - "vmovdqa 224(%[KEY]), %%xmm12\n\t" \ - "\n" \ -"1:\n\t" \ - VAESENC_LAST(%[in], %[out]) /* Encrypt and carry-less multiply for AVX1. 
*/ -#define VAESENC_PCLMUL_1(src, o1, o2, o3) \ - "vmovdqu "#o3"("VAR(HTR)"), %%xmm12\n\t" \ - "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm2, %%xmm0, %%xmm0\n\t" \ - "vpshufd $0x4e, %%xmm12, %%xmm1\n\t" \ - "vpshufd $0x4e, %%xmm0, %%xmm14\n\t" \ - "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ - "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm3\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm5, %%xmm5\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm2\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm7, %%xmm7\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm8, %%xmm8\n\t" \ - "vpclmulqdq $0x00, %%xmm14, %%xmm1, %%xmm1\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm9, %%xmm9\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm10, %%xmm10\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm11, %%xmm11\n\t" \ - "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \ +#define VAESENC_PCLMUL_1(src, o1, o2, o3) \ + "vmovdqa "#o3"(%[HT]), %%xmm12\n\t" \ + "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ + "vpxor %[XV], %%xmm0, %%xmm0\n\t" \ + "vpshufd $78, %%xmm12, %%xmm1\n\t" \ + "vpshufd $78, %%xmm0, %%xmm14\n\t" \ + "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm3\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm2\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpclmulqdq $0x00, %%xmm14, %%xmm1, %%xmm1\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \ -#define VAESENC_PCLMUL_N(src, o1, o2, o3) \ - "vmovdqu "#o3"("VAR(HTR)"), %%xmm12\n\t" \ - "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ - "vpshufd $0x4e, %%xmm12, %%xmm13\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \ - "vpshufd $0x4e, %%xmm0, %%xmm14\n\t" \ - "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ - "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm15\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm5, %%xmm5\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm12\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm7, %%xmm7\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm8, %%xmm8\n\t" \ - "vpclmulqdq $0x00, %%xmm14, %%xmm13, %%xmm13\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm9, %%xmm9\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm10, %%xmm10\n\t" \ - "vaesenc "#o1"(%[KEY]), %%xmm11, %%xmm11\n\t" \ - "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm15, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm15, %%xmm3, %%xmm3\n\t" \ - "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ +#define VAESENC_PCLMUL_N(src, o1, o2, o3) \ + "vmovdqa "#o3"(%[HT]), %%xmm12\n\t" \ + "vmovdqu "#o2"("#src"), %%xmm0\n\t" \ + "vpshufd $78, %%xmm12, %%xmm13\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm0, %%xmm0\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \ + "vpshufd $78, %%xmm0, %%xmm14\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vpclmulqdq $0x11, %%xmm12, %%xmm0, %%xmm15\n\t" \ + "vaesenc 
"#o1"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpclmulqdq $0x00, %%xmm12, %%xmm0, %%xmm12\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpclmulqdq $0x00, %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vaesenc "#o1"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm12, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm15, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm15, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_L(o) \ + "vpslldq $8, %%xmm1, %%xmm14\n\t" \ + "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp2], %[tmp2]\n\t" \ + "vpslld $31, %%xmm2, %%xmm12\n\t" \ + "vpslld $30, %%xmm2, %%xmm13\n\t" \ + "vpslld $25, %%xmm2, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp3], %[tmp3]\n\t" \ + "vpxor %%xmm13, %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm14, %%xmm12, %%xmm12\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp4], %[tmp4]\n\t" \ + "vpsrldq $4, %%xmm12, %%xmm13\n\t" \ + "vpslldq $12, %%xmm12, %%xmm12\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp5], %[tmp5]\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpsrld $1, %%xmm2, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp6], %[tmp6]\n\t" \ + "vpsrld $2, %%xmm2, %%xmm1\n\t" \ + "vpsrld $7, %%xmm2, %%xmm0\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp7], %[tmp7]\n\t" \ + "vpxor %%xmm1, %%xmm14, %%xmm14\n\t" \ + "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ + "vaesenc "#o"(%[KEY]), %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm14, %%xmm14\n\t" \ + "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ + + +/* Encrypt and carry-less multiply for AVX2. 
*/ +#define VAESENC_PCLMUL_AVX2_1(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %[XV], %%xmm12, %%xmm12\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm1\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm2\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, %[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm3\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ -#define VAESENC_PCLMUL_L(o) \ - "vpslldq $8, %%xmm1, %%xmm14\n\t" \ - "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm5, %%xmm5\n\t" \ - "vpslld $31, %%xmm2, %%xmm12\n\t" \ - "vpslld $30, %%xmm2, %%xmm13\n\t" \ - "vpslld $25, %%xmm2, %%xmm14\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm13, %%xmm12, %%xmm12\n\t" \ - "vpxor %%xmm14, %%xmm12, %%xmm12\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm7, %%xmm7\n\t" \ - "vpsrldq $4, %%xmm12, %%xmm13\n\t" \ - "vpslldq $12, %%xmm12, %%xmm12\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ - "vpsrld $1, %%xmm2, %%xmm14\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm9, %%xmm9\n\t" \ - "vpsrld $2, %%xmm2, %%xmm1\n\t" \ - "vpsrld $7, %%xmm2, %%xmm0\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm1, %%xmm14, %%xmm14\n\t" \ - "vpxor %%xmm0, %%xmm14, %%xmm14\n\t" \ - "vaesenc "#o"(%[KEY]), %%xmm11, %%xmm11\n\t" \ - "vpxor %%xmm13, %%xmm14, %%xmm14\n\t" \ - "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ +#define VAESENC_PCLMUL_AVX2_2(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm13\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm15\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, %[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm12\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_AVX2_N(src, o1, o2, o3) \ + "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ + "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ + "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ + "vpclmulqdq $0x10, "#o3"(%[HT]), %%xmm12, %%xmm13\n\t" \ + "vaesenc %%xmm0, %[tmp1], %[tmp1]\n\t" \ + "vaesenc %%xmm0, %[tmp2], %[tmp2]\n\t" \ + "vpclmulqdq $0x01, "#o3"(%[HT]), %%xmm12, %%xmm14\n\t" \ + "vaesenc %%xmm0, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm0, %[tmp4], %[tmp4]\n\t" \ + "vpclmulqdq $0x00, "#o3"(%[HT]), %%xmm12, %%xmm15\n\t" \ + "vaesenc %%xmm0, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm0, 
%[tmp6], %[tmp6]\n\t" \ + "vpclmulqdq $0x11, "#o3"(%[HT]), %%xmm12, %%xmm12\n\t" \ + "vaesenc %%xmm0, %[tmp7], %[tmp7]\n\t" \ + "vaesenc %%xmm0, %[tmp8], %[tmp8]\n\t" \ + "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ + "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ + "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ + +#define VAESENC_PCLMUL_AVX2_L(o) \ + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ + "vpslldq $8, %%xmm1, %%xmm12\n\t" \ + "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ + "vmovdqa "#o"(%[KEY]), %%xmm15\n\t" \ + "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ + "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ + "vaesenc %%xmm15, %[tmp1], %[tmp1]\n\t" \ + "vmovdqa %[MOD2_128], %%xmm0\n\t" \ + "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ + "vaesenc %%xmm15, %[tmp2], %[tmp2]\n\t" \ + "vaesenc %%xmm15, %[tmp3], %[tmp3]\n\t" \ + "vaesenc %%xmm15, %[tmp4], %[tmp4]\n\t" \ + "vpshufd $78, %%xmm2, %%xmm13\n\t" \ + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vpclmulqdq $0x10, %%xmm0, %%xmm13, %%xmm14\n\t" \ + "vaesenc %%xmm15, %[tmp5], %[tmp5]\n\t" \ + "vaesenc %%xmm15, %[tmp6], %[tmp6]\n\t" \ + "vaesenc %%xmm15, %[tmp7], %[tmp7]\n\t" \ + "vpshufd $78, %%xmm13, %%xmm13\n\t" \ + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" \ + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" \ + "vmovdqa %%xmm13, %%xmm2\n\t" \ + "vaesenc %%xmm15, %[tmp8], %[tmp8]\n\t" /* Encrypt and carry-less multiply with last key. */ -#define VAESENC_LAST(in, out) \ - "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t" \ - "vaesenclast %%xmm12, %%xmm5, %%xmm5\n\t" \ - "vmovdqu ("#in"), %%xmm0\n\t" \ - "vmovdqu 16("#in"), %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm1, %%xmm5, %%xmm5\n\t" \ - "vmovdqu %%xmm4, ("#out")\n\t" \ - "vmovdqu %%xmm5, 16("#out")\n\t" \ - "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t" \ - "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t" \ - "vmovdqu 32("#in"), %%xmm0\n\t" \ - "vmovdqu 48("#in"), %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm1, %%xmm7, %%xmm7\n\t" \ - "vmovdqu %%xmm6, 32("#out")\n\t" \ - "vmovdqu %%xmm7, 48("#out")\n\t" \ - "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t" \ - "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t" \ - "vmovdqu 64("#in"), %%xmm0\n\t" \ - "vmovdqu 80("#in"), %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm1, %%xmm9, %%xmm9\n\t" \ - "vmovdqu %%xmm8, 64("#out")\n\t" \ - "vmovdqu %%xmm9, 80("#out")\n\t" \ - "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t" \ - "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t" \ - "vmovdqu 96("#in"), %%xmm0\n\t" \ - "vmovdqu 112("#in"), %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm1, %%xmm11, %%xmm11\n\t" \ - "vmovdqu %%xmm10, 96("#out")\n\t" \ - "vmovdqu %%xmm11, 112("#out")\n\t" +#define VAESENC_LAST() \ + "vaesenclast %%xmm12, %[tmp1], %[tmp1]\n\t" \ + "vaesenclast %%xmm12, %[tmp2], %[tmp2]\n\t" \ + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" \ + "vpxor 16(%[in]), %[tmp2], %[tmp2]\n\t" \ + "vmovdqu %[tmp1], (%[out])\n\t" \ + "vmovdqu %[tmp2], 16(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp3], %[tmp3]\n\t" \ + "vaesenclast %%xmm12, %[tmp4], %[tmp4]\n\t" \ + "vpxor 32(%[in]), %[tmp3], %[tmp3]\n\t" \ + "vpxor 48(%[in]), %[tmp4], %[tmp4]\n\t" \ + "vmovdqu %[tmp3], 32(%[out])\n\t" \ + "vmovdqu %[tmp4], 48(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp5], %[tmp5]\n\t" \ + "vaesenclast %%xmm12, %[tmp6], %[tmp6]\n\t" \ + "vpxor 64(%[in]), %[tmp5], %[tmp5]\n\t" \ + "vpxor 80(%[in]), %[tmp6], %[tmp6]\n\t" \ + "vmovdqu %[tmp5], 64(%[out])\n\t" \ + "vmovdqu %[tmp6], 80(%[out])\n\t" \ + "vaesenclast %%xmm12, %[tmp7], %[tmp7]\n\t" \ + "vaesenclast %%xmm12, %[tmp8], %[tmp8]\n\t" \ + "vpxor 
96(%[in]), %[tmp7], %[tmp7]\n\t" \ + "vpxor 112(%[in]), %[tmp8], %[tmp8]\n\t" \ + "vmovdqu %[tmp7], 96(%[out])\n\t" \ + "vmovdqu %[tmp8], 112(%[out])\n\t" -#define VAESENC_BLOCK() \ - "vmovdqu "VAR(CTR1)", %%xmm5\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \ - "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \ - "vmovdqu %%xmm5, "VAR(CTR1)"\n\t" \ - "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu (%[in],"VAR(KR64)",1), %%xmm5\n\t" \ - "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu %%xmm4, (%[out],"VAR(KR64)",1)\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" +#define VAESENC_BLOCK() \ + "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" \ + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" \ + "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "cmpl $11, %[nr]\n\t" \ + "vmovaps 160(%[KEY]), %[tmp2]\n\t" \ + "jl %=f\n\t" \ + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "cmpl $13, %[nr]\n\t" \ + "vmovaps 192(%[KEY]), %[tmp2]\n\t" \ + "jl %=f\n\t" \ + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" \ + "vmovaps 224(%[KEY]), %[tmp2]\n\t" \ + "%=:\n\t" \ + "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" \ + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" \ + "vmovdqu %[tmp1], (%[out])\n\t" \ + "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" \ + "vpxor %[tmp1], %[X], %[X]\n\t" -#define _VAESENC_GFMUL(in, H, X) \ - "vmovdqu "VAR(CTR1)", %%xmm5\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \ - "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \ - "vmovdqu %%xmm5, "VAR(CTR1)"\n\t" \ - "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpclmulqdq $0x10, "#H", "#X", %%xmm6\n\t" \ - "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpclmulqdq $0x01, "#H", "#X", %%xmm7\n\t" \ - "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpclmulqdq $0x00, "#H", "#X", %%xmm8\n\t" \ - "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpclmulqdq $0x11, "#H", "#X", %%xmm1\n\t" \ - "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \ - "vpslldq $8, %%xmm6, %%xmm2\n\t" \ - "vpsrldq $8, %%xmm6, %%xmm6\n\t" \ - "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm8, %%xmm2, %%xmm2\n\t" \ 
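For reference, the single-block path that the new VAESENC_BLOCK() macro implements can be written with plain intrinsics as below. This is an illustrative sketch only, not part of the patch: it reuses the KEY layout, the nr round count and the BSWAP_EPI64/BSWAP_MASK/ONE constants defined elsewhere in this file, and the helper name ctr_block_enc_ref is invented here.

/* Sketch only: encrypt one CTR block and fold the ciphertext into the GHASH state. */
static void ctr_block_enc_ref(__m128i* ctr1, __m128i* X, const __m128i* KEY,
                              int nr, const unsigned char* in,
                              unsigned char* out)
{
    int r;
    /* the counter is kept byte-swapped; swap back to big-endian before encrypting */
    __m128i blk = _mm_shuffle_epi8(*ctr1, BSWAP_EPI64);
    *ctr1 = _mm_add_epi32(*ctr1, ONE);
    blk = _mm_xor_si128(blk, KEY[0]);
    for (r = 1; r < nr; r++)
        blk = _mm_aesenc_si128(blk, KEY[r]);
    blk = _mm_aesenclast_si128(blk, KEY[nr]);
    /* CTR mode: XOR the key stream with the input block and store the result */
    blk = _mm_xor_si128(blk, _mm_loadu_si128((const __m128i*)in));
    _mm_storeu_si128((__m128i*)out, blk);
    /* accumulate the byte-reversed ciphertext into X; the multiply by H follows */
    *X = _mm_xor_si128(*X, _mm_shuffle_epi8(blk, BSWAP_MASK));
}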
- "vpxor %%xmm6, %%xmm1, %%xmm3\n\t" \ - "vmovdqa %[MOD2_128], %%xmm0\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm7\n\t" \ - "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpshufd $0x4e, %%xmm2, %%xmm6\n\t" \ - "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm6, %%xmm7\n\t" \ - "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm7, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm3, %%xmm6, "VAR(XR)"\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl 1f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl 1f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ - "1:\n\t" \ - "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu "#in", %%xmm0\n\t" \ - "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \ - "vmovdqu %%xmm4, (%[out],"VAR(KR64)",1)\n\t" -#define VAESENC_GFMUL(in, H, X) \ - _VAESENC_GFMUL(in, H, X) +#define aes_gcm_avx1_calc_iv_12(kKEY, ivec, nr, H, Y, T, X) \ +do \ +{ \ + for (j=0; j < 12; j++) \ + ((unsigned char*)&Y)[j] = ivec[j]; \ + Y = _mm_insert_epi32(Y, 0x1000000, 3); \ + \ + __asm__ __volatile__ ( \ + "vmovaps 0(%[KEY]), %%xmm5\n\t" \ + "vmovaps 16(%[KEY]), %%xmm6\n\t" \ + "vpxor %%xmm5, %[X], %[H]\n\t" \ + "vpxor %%xmm5, %[Y], %[T]\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "vmovaps 32(%[KEY]), %%xmm5\n\t" \ + "vmovaps 48(%[KEY]), %%xmm6\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "vmovaps 64(%[KEY]), %%xmm5\n\t" \ + "vmovaps 80(%[KEY]), %%xmm6\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "vmovaps 96(%[KEY]), %%xmm5\n\t" \ + "vmovaps 112(%[KEY]), %%xmm6\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "vmovaps 128(%[KEY]), %%xmm5\n\t" \ + "vmovaps 144(%[KEY]), %%xmm6\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "cmpl $11, %[nr]\n\t" \ + "vmovaps 160(%[KEY]), %%xmm5\n\t" \ + "jl %=f\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vmovaps 176(%[KEY]), %%xmm6\n\t" \ + "vmovaps 192(%[KEY]), %%xmm5\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "cmpl $13, %[nr]\n\t" \ + "jl %=f\n\t" \ + "vaesenc %%xmm5, %[H], %[H]\n\t" \ + "vaesenc %%xmm5, %[T], %[T]\n\t" \ + "vmovaps 208(%[KEY]), %%xmm6\n\t" \ + "vmovaps 224(%[KEY]), %%xmm5\n\t" \ + "vaesenc %%xmm6, %[H], %[H]\n\t" \ + "vaesenc %%xmm6, %[T], %[T]\n\t" \ + "%=:\n\t" \ + "vaesenclast %%xmm5, %[H], %[H]\n\t" \ + "vaesenclast %%xmm5, %[T], %[T]\n\t" \ + "vpshufb %[BSWAP_MASK], %[H], %[H]\n\t" \ + \ + : [H] "=xr" (H), [Y] "+xr" (Y), [T] "=xr" (T), \ + [X] "+xr" (X) \ + : [KEY] "r" (KEY), [nr] "r" (nr), \ + [BSWAP_MASK] "m" (BSWAP_MASK) \ + : "memory", "xmm5", "xmm6" \ + ); \ +} \ +while (0) - -#define _GHASH_GFMUL_AVX1(r, r2, a, b) \ - "vpshufd $0x4e, "#a", %%xmm1\n\t" \ - "vpshufd $0x4e, "#b", %%xmm2\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ - "vpxor "#a", %%xmm1, 
%%xmm1\n\t" \ - "vpxor "#b", %%xmm2, %%xmm2\n\t" \ - "vpclmulqdq $0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \ - "vmovdqa %%xmm0, "#r2"\n\t" \ - "vmovdqa %%xmm3, "#r"\n\t" \ - "vpslldq $8, %%xmm1, %%xmm2\n\t" \ - "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm2, "#r2", "#r2"\n\t" \ - "vpxor %%xmm1, "#r", "#r"\n\t" -#define GHASH_GFMUL_AVX1(r, r2, a, b) \ - _GHASH_GFMUL_AVX1(r, r2, a, b) - -#define _GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \ - "vpshufd $0x4e, "#a", %%xmm1\n\t" \ - "vpshufd $0x4e, "#b", %%xmm2\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ - "vpxor "#a", %%xmm1, %%xmm1\n\t" \ - "vpxor "#b", %%xmm2, %%xmm2\n\t" \ - "vpclmulqdq $0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm3, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm0, "#r2", "#r2"\n\t" \ - "vpxor %%xmm3, "#r", "#r"\n\t" \ - "vpslldq $8, %%xmm1, %%xmm2\n\t" \ - "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm2, "#r2", "#r2"\n\t" \ - "vpxor %%xmm1, "#r", "#r"\n\t" -#define GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \ - _GHASH_GFMUL_XOR_AVX1(r, r2, a, b) - -#define GHASH_MID_AVX1(r, r2) \ - "vpsrld $31, "#r2", %%xmm0\n\t" \ - "vpsrld $31, "#r", %%xmm1\n\t" \ - "vpslld $1, "#r2", "#r2"\n\t" \ - "vpslld $1, "#r", "#r"\n\t" \ - "vpsrldq $12, %%xmm0, %%xmm2\n\t" \ - "vpslldq $4, %%xmm0, %%xmm0\n\t" \ - "vpslldq $4, %%xmm1, %%xmm1\n\t" \ - "vpor %%xmm2, "#r", "#r"\n\t" \ - "vpor %%xmm0, "#r2", "#r2"\n\t" \ - "vpor %%xmm1, "#r", "#r"\n\t" - -#define _GHASH_GFMUL_RED_AVX1(r, a, b) \ - "vpshufd $0x4e, "#a", %%xmm5\n\t" \ - "vpshufd $0x4e, "#b", %%xmm6\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm7\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm4\n\t" \ - "vpxor "#a", %%xmm5, %%xmm5\n\t" \ - "vpxor "#b", %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x00, %%xmm6, %%xmm5, %%xmm5\n\t" \ - "vpxor %%xmm4, %%xmm5, %%xmm5\n\t" \ - "vpxor %%xmm7, %%xmm5, %%xmm5\n\t" \ - "vpslldq $8, %%xmm5, %%xmm6\n\t" \ - "vpsrldq $8, %%xmm5, %%xmm5\n\t" \ - "vpxor %%xmm6, %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm5, %%xmm7, "#r"\n\t" \ - "vpslld $31, %%xmm4, %%xmm8\n\t" \ - "vpslld $30, %%xmm4, %%xmm9\n\t" \ - "vpslld $25, %%xmm4, %%xmm10\n\t" \ - "vpxor %%xmm9, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm10, %%xmm8, %%xmm8\n\t" \ - "vpsrldq $4, %%xmm8, %%xmm9\n\t" \ - "vpslldq $12, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm8, %%xmm4, %%xmm4\n\t" \ - "vpsrld $1, %%xmm4, %%xmm10\n\t" \ - "vpsrld $2, %%xmm4, %%xmm6\n\t" \ - "vpsrld $7, %%xmm4, %%xmm5\n\t" \ - "vpxor %%xmm6, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm5, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm9, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm10, "#r", "#r"\n\t" -#define GHASH_GFMUL_RED_AVX1(r, a, b) \ - _GHASH_GFMUL_RED_AVX1(r, a, b) - -#define _GHASH_GFSQR_RED_AVX1(r, a) \ - "vpclmulqdq $0x00, "#a", "#a", %%xmm4\n\t" \ - "vpclmulqdq $0x11, "#a", "#a", "#r"\n\t" \ - "vpslld $31, %%xmm4, %%xmm8\n\t" \ - "vpslld $30, %%xmm4, %%xmm9\n\t" \ - "vpslld $25, %%xmm4, %%xmm10\n\t" \ - "vpxor %%xmm9, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm10, %%xmm8, %%xmm8\n\t" \ - "vpsrldq $4, %%xmm8, %%xmm9\n\t" \ - "vpslldq $12, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm8, %%xmm4, %%xmm4\n\t" \ - "vpsrld $1, %%xmm4, %%xmm10\n\t" \ - "vpsrld $2, %%xmm4, %%xmm6\n\t" \ - "vpsrld $7, %%xmm4, %%xmm5\n\t" \ - "vpxor %%xmm6, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm5, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm9, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm10, "#r", 
"#r"\n\t" -#define GHASH_GFSQR_RED_AVX1(r, a) \ - _GHASH_GFSQR_RED_AVX1(r, a) - -#define GHASH_RED_AVX1(r, r2) \ - "vpslld $31, "#r2", %%xmm0\n\t" \ - "vpslld $30, "#r2", %%xmm1\n\t" \ - "vpslld $25, "#r2", %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm2, %%xmm0, %%xmm0\n\t" \ - "vmovdqa %%xmm0, %%xmm1\n\t" \ - "vpsrldq $4, %%xmm1, %%xmm1\n\t" \ - "vpslldq $12, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm0, "#r2", "#r2"\n\t" \ - "vpsrld $1, "#r2", %%xmm2\n\t" \ - "vpsrld $2, "#r2", %%xmm3\n\t" \ - "vpsrld $7, "#r2", %%xmm0\n\t" \ - "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm0, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \ - "vpxor "#r2", %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm2, "#r", "#r"\n\t" - -#define GHASH_GFMUL_RED_XOR_AVX1(r, r2, a, b) \ - GHASH_GFMUL_XOR_AVX1(r, r2, a, b) \ - GHASH_RED_AVX1(r, r2) - -#define GHASH_FULL_AVX1(r, r2, a, b) \ - GHASH_GFMUL_AVX1(r, r2, a, b) \ - GHASH_MID_AVX1(r, r2) \ - GHASH_RED_AVX1(r, r2) - -#define CALC_IV_12_AVX1() \ - "# Calculate values when IV is 12 bytes\n\t" \ - "# Set counter based on IV\n\t" \ - "movl $0x01000000, %%ecx\n\t" \ - "vpinsrq $0, 0(%%rax), %%xmm13, %%xmm13\n\t" \ - "vpinsrd $2, 8(%%rax), %%xmm13, %%xmm13\n\t" \ - "vpinsrd $3, %%ecx, %%xmm13, %%xmm13\n\t" \ - "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \ - "vmovdqa 0(%[KEY]), "VAR(HR)"\n\t" \ - "vpxor "VAR(HR)", %%xmm13, %%xmm1\n\t" \ - "vmovdqa 16(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 32(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 48(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 64(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 80(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 96(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 112(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 128(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 144(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 31f\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 176(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 31f\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 208(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqu 224(%[KEY]), %%xmm12\n\t" \ - "31:\n\t" \ - "vaesenclast %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenclast %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vpshufb %[BSWAP_MASK], "VAR(HR)", "VAR(HR)"\n\t" \ - "vmovdqu %%xmm1, "VAR(TR)"\n\t" \ - "jmp 39f\n\t" - -#define CALC_IV_AVX1() \ - "# Calculate values when IV is not 12 bytes\n\t" \ - "# H = Encrypt X(=0)\n\t" \ - "vmovdqa 0(%[KEY]), 
"VAR(HR)"\n\t" \ - VAESENC_AVX(HR) \ - "vpshufb %[BSWAP_MASK], "VAR(HR)", "VAR(HR)"\n\t" \ - "# Calc counter\n\t" \ - "# Initialization vector\n\t" \ - "cmpl $0, %%edx\n\t" \ - "movq $0, %%rcx\n\t" \ - "je 45f\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 44f\n\t" \ - "andl $0xfffffff0, %%edx\n\t" \ - "\n" \ - "43:\n\t" \ - "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 43b\n\t" \ - "movl %[ibytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 45f\n\t" \ - "\n" \ - "44:\n\t" \ - "subq $16, %%rsp\n\t" \ - "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "vmovdqu %%xmm4, (%%rsp)\n\t" \ - "42:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 42b\n\t" \ - "vmovdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \ - "\n" \ - "45:\n\t" \ - "# T = Encrypt counter\n\t" \ - "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ - "shll $3, %%edx\n\t" \ - "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm0, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR) \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "# Encrypt counter\n\t" \ - "vmovdqa 0(%[KEY]), %%xmm4\n\t" \ - "vpxor %%xmm13, %%xmm4, %%xmm4\n\t" \ - VAESENC_AVX(%%xmm4) \ - "vmovdqu %%xmm4, "VAR(TR)"\n\t" - -#define CALC_AAD_AVX1() \ - "# Additional authentication data\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl $0, %%edx\n\t" \ - "je 25f\n\t" \ - "movq %[addt], %%rax\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 24f\n\t" \ - "andl $0xfffffff0, %%edx\n\t" \ - "\n" \ - "23:\n\t" \ - "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_FULL_AVX1(XR, %%xmm12, XR, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 23b\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 25f\n\t" \ - "\n" \ - "24:\n\t" \ - "subq $16, %%rsp\n\t" \ - "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "vmovdqu %%xmm4, (%%rsp)\n\t" \ - "22:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 22b\n\t" \ - "vmovdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_FULL_AVX1(XR, %%xmm12, XR, HR) \ - "\n" \ - "25:\n\t" - -#define CALC_HT_8_AVX1() \ - "vmovdqa "VAR(XR)", %%xmm2\n\t" \ - "# H ^ 1\n\t" \ - "vmovdqu "VAR(HR)", 0("VAR(HTR)")\n\t" \ - "# H ^ 2\n\t" \ - GHASH_GFSQR_RED_AVX1(%%xmm0, HR) \ - "vmovdqu %%xmm0 , 16("VAR(HTR)")\n\t" \ - "# H ^ 3\n\t" \ - GHASH_GFMUL_RED_AVX1(%%xmm1, HR, %%xmm0) \ - "vmovdqu %%xmm1 , 32("VAR(HTR)")\n\t" \ - "# H ^ 4\n\t" \ - GHASH_GFSQR_RED_AVX1(%%xmm3, %%xmm0) \ - "vmovdqu %%xmm3 , 48("VAR(HTR)")\n\t" \ - "# H ^ 5\n\t" \ - GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm0, %%xmm1) \ - "vmovdqu %%xmm12, 64("VAR(HTR)")\n\t" \ - "# H ^ 6\n\t" \ - GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm1) \ - "vmovdqu %%xmm12, 80("VAR(HTR)")\n\t" \ - "# H ^ 7\n\t" \ - GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm1, %%xmm3) \ - "vmovdqu %%xmm12, 
96("VAR(HTR)")\n\t" \ - "# H ^ 8\n\t" \ - GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm3) \ - "vmovdqu %%xmm12, 112("VAR(HTR)")\n\t" - -#define VAESENC_128_GHASH_AVX1(src, o) \ - "leaq (%[in],"VAR(KR64)",1), %%rcx\n\t" \ - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" \ - /* src is either %%rcx or %%rdx */ \ - VAESENC_CTR() \ - VAESENC_XOR() \ - VAESENC_PCLMUL_1(src, 16, (o-128), 112) \ - VAESENC_PCLMUL_N(src, 32, (o-112), 96) \ - VAESENC_PCLMUL_N(src, 48, (o- 96), 80) \ - VAESENC_PCLMUL_N(src, 64, (o- 80), 64) \ - VAESENC_PCLMUL_N(src, 80, (o- 64), 48) \ - VAESENC_PCLMUL_N(src, 96, (o- 48), 32) \ - VAESENC_PCLMUL_N(src, 112, (o- 32), 16) \ - VAESENC_PCLMUL_N(src, 128, (o- 16), 0) \ - VAESENC_PCLMUL_L(144) \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - VAESENC() \ - VAESENC_SET(176) \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - VAESENC() \ - VAESENC_SET(208) \ - "vmovdqa 224(%[KEY]), %%xmm12\n\t" \ - "\n" \ -"4:\n\t" \ - VAESENC_LAST(%%rcx, %%rdx) - -#define _VAESENC_AVX(r) \ - "vaesenc 16(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 32(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 48(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 64(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 80(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 96(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 112(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 128(%[KEY]), "#r", "#r"\n\t" \ - "vaesenc 144(%[KEY]), "#r", "#r"\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, "#r", "#r"\n\t" \ - "vaesenc 176(%[KEY]), "#r", "#r"\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, "#r", "#r"\n\t" \ - "vaesenc 208(%[KEY]), "#r", "#r"\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "vaesenclast %%xmm5, "#r", "#r"\n\t" -#define VAESENC_AVX(r) \ - _VAESENC_AVX(r) - -#define AESENC_LAST15_ENC_AVX1() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "vmovdqu "VAR(CTR1)", %%xmm13\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ - "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ - VAESENC_AVX(%%xmm13) \ - "subq $16, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "vmovdqu %%xmm13, (%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "xorq %%r13, %%r13\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "je 53f\n\t" \ - "\n" \ - "52:\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl %%ecx\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "jl 52b\n\t" \ - "53:\n\t" \ - "vmovdqu (%%rsp), %%xmm13\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "vpxor %%xmm13, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ - -#define AESENC_LAST15_DEC_AVX1() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "vmovdqu "VAR(CTR1)", %%xmm13\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ - "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ - VAESENC_AVX(%%xmm13) \ - "subq $32, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "vmovdqu %%xmm13, (%%rsp)\n\t" \ - "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ - "vmovdqu %%xmm0, 16(%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \ - "xorb 
(%%rsp,%%rcx,1), %%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "53:\n\t" \ - "vmovdqu 16(%%rsp), %%xmm13\n\t" \ - "addq $32, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "vpxor %%xmm13, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ - -#define CALC_TAG_AVX1() \ - "movl %[nbytes], %%edx\n\t" \ - "movl %[abytes], %%ecx\n\t" \ - "shlq $3, %%rdx\n\t" \ - "shlq $3, %%rcx\n\t" \ - "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ - "vpinsrq $1, %%rcx, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm0, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX1(XR, HR, XR) \ - "vpshufb %[BSWAP_MASK], "VAR(XR)", "VAR(XR)"\n\t" \ - "vpxor "VAR(TR)", "VAR(XR)", %%xmm0\n\t" \ - - -static void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, +void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr); +void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, unsigned int nbytes, unsigned int abytes, unsigned int ibytes, const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" - "vpxor "VAR(XR)", "VAR(XR)", "VAR(XR)"\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12_AVX1() - "\n" - "35:\n\t" - CALC_IV_AVX1() - "\n" - "39:\n\t" - - CALC_AAD_AVX1() - - "# Calculate counter and H\n\t" - "vpsrlq $63, "VAR(HR)", %%xmm5\n\t" - "vpsllq $1, "VAR(HR)", %%xmm4\n\t" - "vpslldq $8, %%xmm5, %%xmm5\n\t" - "vpor %%xmm5, %%xmm4, %%xmm4\n\t" - "vpshufd $0xff, "VAR(HR)", "VAR(HR)"\n\t" - "vpsrad $31, "VAR(HR)", "VAR(HR)"\n\t" - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" - "vpand %[MOD2_128], "VAR(HR)", "VAR(HR)"\n\t" - "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" - "vpxor %%xmm4, "VAR(HR)", "VAR(HR)"\n\t" - "vmovdqu %%xmm13, "VAR(CTR1)"\n\t" - - "xorl "VAR(KR)", "VAR(KR)"\n\t" - + int i, j ,k; + __m128i ctr1; + __m128i H, T; + __m128i X = _mm_setzero_si128(); + __m128i Y = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); #if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - "cmpl $128, %[nbytes]\n\t" - "movl %[nbytes], %%r13d\n\t" - "jl 5f\n\t" - "andl $0xffffff80, %%r13d\n\t" - - CALC_HT_8_AVX1() - - "# First 128 bytes of input\n\t" - VAESENC_128() - - "cmpl $128, %%r13d\n\t" - "movl $128, "VAR(KR)"\n\t" - "jle 2f\n\t" - - "# More 128 bytes of input\n\t" - "\n" - "3:\n\t" - VAESENC_128_GHASH_AVX1(%%rdx, 0) - "addl $128, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 3b\n\t" - "\n" - "2:\n\t" - "vmovdqa %[BSWAP_MASK], %%xmm13\n\t" - "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" - "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" - "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" - "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" - "vpxor %%xmm2, %%xmm4, %%xmm4\n\t" - "vpshufb %%xmm13, %%xmm8, %%xmm8\n\t" - "vpshufb %%xmm13, %%xmm9, %%xmm9\n\t" - "vpshufb %%xmm13, %%xmm10, %%xmm10\n\t" - "vpshufb %%xmm13, %%xmm11, %%xmm11\n\t" - - "vmovdqu ("VAR(HTR)"), %%xmm12\n\t" - "vmovdqu 16("VAR(HTR)"), %%xmm14\n\t" - GHASH_GFMUL_AVX1(XR, %%xmm13, %%xmm11, %%xmm12) - 
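The additional-authentication-data loop added in this function body is the plain GHASH recurrence X = (X xor A_i) * H over GF(2^128). A minimal reference form, assuming gfmul_sw() as defined earlier in this file and a block already byte-reversed with BSWAP_MASK (ghash_absorb_ref is an invented name, illustrative only):

/* Sketch only: absorb one 16-byte block into the GHASH accumulator. */
static __m128i ghash_absorb_ref(__m128i X, __m128i block_be, __m128i H)
{
    return gfmul_sw(_mm_xor_si128(X, block_be), H);
}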
GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm10, %%xmm14) - "vmovdqu 32("VAR(HTR)"), %%xmm12\n\t" - "vmovdqu 48("VAR(HTR)"), %%xmm14\n\t" - GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm9, %%xmm12) - GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm8, %%xmm14) - "vmovdqu 64("VAR(HTR)"), %%xmm12\n\t" - "vmovdqu 80("VAR(HTR)"), %%xmm14\n\t" - GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm7, %%xmm12) - GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm6, %%xmm14) - "vmovdqu 96("VAR(HTR)"), %%xmm12\n\t" - "vmovdqu 112("VAR(HTR)"), %%xmm14\n\t" - GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm5, %%xmm12) - GHASH_GFMUL_RED_XOR_AVX1(XR, %%xmm13, %%xmm4, %%xmm14) - - "vmovdqu 0("VAR(HTR)"), "VAR(HR)"\n\t" - "\n" - "5:\n\t" - "movl %[nbytes], %%edx\n\t" - "cmpl %%edx, "VAR(KR)"\n\t" - "jge 55f\n\t" + __m128i HT[8]; + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); +#else + __m128i tmp1, tmp2; #endif - "movl %[nbytes], %%r13d\n\t" - "andl $0xfffffff0, %%r13d\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 14f\n\t" + if (ibytes == 12) + aes_gcm_avx1_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - VAESENC_BLOCK() - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 13f\n\t" - "\n" - "12:\n\t" - "vmovdqu (%[in],"VAR(KR64)",1), %%xmm9\n\t" - VAESENC_GFMUL(%%xmm9, HR, XR) - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" - "addl $16, "VAR(KR)"\n\t" - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 12b\n\t" - "\n" - "13:\n\t" - GHASH_GFMUL_RED_AVX1(XR, HR, XR) - "\n" - "14:\n\t" + for (i=0; i < (int)(abytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } + if (abytes%16) { + last_block = _mm_setzero_si128(); + for (j=0; j < (int)(abytes%16); j++) + ((unsigned char*)&last_block)[j] = addt[i*16+j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } - AESENC_LAST15_ENC_AVX1() - "\n" - "55:\n\t" + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); - CALC_TAG_AVX1() - "vmovdqu %%xmm0, (%[tag])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" - "vzeroupper\n\t" +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ibytes), - [tag] "r" (tag), + pctr1[0] = ctr1; + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_SET(16) + VAESENC_SET(32) + VAESENC_SET(48) + VAESENC_SET(64) + VAESENC_SET(80) + VAESENC_SET(96) + VAESENC_SET(112) + VAESENC_SET(128) + VAESENC_SET(144) + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + 
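With HT[i] holding H^(i+1), the unrolled path hashes eight ciphertext blocks in one pass via gfmul8(): X' = (C1 xor X)*H^8 xor C2*H^7 xor ... xor C8*H. The sketch below restates that fold with a per-term multiply-and-reduce using gfmul_shifted() from earlier in this file; ghash_fold8_ref and the C[] array of byte-reversed ciphertext blocks are illustrative names, not part of the patch.

/* Sketch only: mathematically equivalent (but slower) form of the eight-block GHASH fold. */
static __m128i ghash_fold8_ref(__m128i X, const __m128i* C, const __m128i* HT)
{
    __m128i r = _mm_setzero_si128();
    int i;
    for (i = 0; i < 8; i++) {
        /* the first block carries the running hash; HT[7 - i] is H^(8 - i) */
        __m128i t = (i == 0) ? _mm_xor_si128(X, C[0]) : C[i];
        r = _mm_xor_si128(r, gfmul_shifted(t, HT[7 - i]));
    }
    return r;
}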
VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + "\n" + "1:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8) + : [KEY] "r" (KEY), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + + XV = X; + for (i=1; i < (int)(nbytes/16/8); i++) { + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_PCLMUL_1(%[out], 16, -128, 112) + VAESENC_PCLMUL_N(%[out], 32, -112, 96) + VAESENC_PCLMUL_N(%[out], 48, -96, 80) + VAESENC_PCLMUL_N(%[out], 64, -80, 64) + VAESENC_PCLMUL_N(%[out], 80, -64, 48) + VAESENC_PCLMUL_N(%[out], 96, -48, 32) + VAESENC_PCLMUL_N(%[out], 112, -32, 16) + VAESENC_PCLMUL_N(%[out], 128, -16, 0) + VAESENC_PCLMUL_L(144) + + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + + "%=:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8), + [XV] "+xr" (XV) + : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + } + X = XV; + ctr1 = pctr1[0]; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + "# Carryless Multiply X by H (128 x 128)\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "# Reduce\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "# End Reduce\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" 
(tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), [BSWAP_MASK] "m" (BSWAP_MASK), [BSWAP_EPI64] "m" (BSWAP_EPI64), [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), -#endif [MOD2_128] "m" (MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "rbx", "rcx", "rdx", "r13" - ); + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "memory" + ); + } + for (; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" + "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovaps 224(%[KEY]), %[tmp2]\n\t" + "%=:\n\t" + "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" + "vmovdqu %[tmp1], (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" + "vpxor %[tmp1], %[X], %[X]\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + if (k > 0) { + X = gfmul_shifted(X, H); + } +#endif + /* If one partial block remains */ + if (nbytes % 
16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); } #ifdef HAVE_INTEL_AVX2 -/* Encrypt and carry-less multiply for AVX2. */ -#define VAESENC_PCLMUL_AVX2_1(src, o1, o2, o3) \ - "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ - "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ - "vmovdqu "#o3"("VAR(HTR)"), %%xmm13\n\t" \ - "vpxor %%xmm2, %%xmm12, %%xmm12\n\t" \ - "vpclmulqdq $0x10, %%xmm13, %%xmm12, %%xmm1\n\t" \ - "vpclmulqdq $0x01, %%xmm13, %%xmm12, %%xmm14\n\t" \ - "vpclmulqdq $0x00, %%xmm13, %%xmm12, %%xmm2\n\t" \ - "vpclmulqdq $0x11, %%xmm13, %%xmm12, %%xmm3\n\t" \ - "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ - "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ - "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ - "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ - "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ - "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ - "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ - "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ - -#define VAESENC_PCLMUL_AVX2_2(src, o1, o2, o3) \ - "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ - "vmovdqu "#o3"("VAR(HTR)"), %%xmm0\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ - "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm12, %%xmm13\n\t" \ - "vpclmulqdq $0x01, %%xmm0, %%xmm12, %%xmm14\n\t" \ - "vpclmulqdq $0x00, %%xmm0, %%xmm12, %%xmm15\n\t" \ - "vpclmulqdq $0x11, %%xmm0, %%xmm12, %%xmm12\n\t" \ - "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ - "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ - "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ - "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ - "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ - "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ - "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ - "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ - "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ - "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ - -#define VAESENC_PCLMUL_AVX2_N(src, o1, o2, o3) \ - "vmovdqu "#o2"("#src"), %%xmm12\n\t" \ - "vmovdqu "#o3"("VAR(HTR)"), %%xmm0\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm12, %%xmm12\n\t" \ - "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm12, %%xmm13\n\t" \ - "vpclmulqdq $0x01, %%xmm0, %%xmm12, 
%%xmm14\n\t" \ - "vpclmulqdq $0x00, %%xmm0, %%xmm12, %%xmm15\n\t" \ - "vpclmulqdq $0x11, %%xmm0, %%xmm12, %%xmm12\n\t" \ - "vmovdqa "#o1"(%[KEY]), %%xmm0\n\t" \ - "vpxor %%xmm13, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm12, %%xmm3, %%xmm3\n\t" \ - "vaesenc %%xmm0, %%xmm4, %%xmm4\n\t" \ - "vaesenc %%xmm0, %%xmm5, %%xmm5\n\t" \ - "vaesenc %%xmm0, %%xmm6, %%xmm6\n\t" \ - "vaesenc %%xmm0, %%xmm7, %%xmm7\n\t" \ - "vaesenc %%xmm0, %%xmm8, %%xmm8\n\t" \ - "vaesenc %%xmm0, %%xmm9, %%xmm9\n\t" \ - "vaesenc %%xmm0, %%xmm10, %%xmm10\n\t" \ - "vaesenc %%xmm0, %%xmm11, %%xmm11\n\t" \ - -#define VAESENC_PCLMUL_AVX2_L(o) \ - "vpxor %%xmm14, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" \ - "vpslldq $8, %%xmm1, %%xmm12\n\t" \ - "vpsrldq $8, %%xmm1, %%xmm1\n\t" \ - "vmovdqa "#o"(%[KEY]), %%xmm15\n\t" \ - "vmovdqa %[MOD2_128], %%xmm0\n\t" \ - "vaesenc %%xmm15, %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm12, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ - "vaesenc %%xmm15, %%xmm5, %%xmm5\n\t" \ - "vaesenc %%xmm15, %%xmm6, %%xmm6\n\t" \ - "vaesenc %%xmm15, %%xmm7, %%xmm7\n\t" \ - "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ - "vpclmulqdq $0x10, %%xmm0, %%xmm2, %%xmm14\n\t" \ - "vaesenc %%xmm15, %%xmm8, %%xmm8\n\t" \ - "vaesenc %%xmm15, %%xmm9, %%xmm9\n\t" \ - "vaesenc %%xmm15, %%xmm10, %%xmm10\n\t" \ - "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm3, %%xmm2, %%xmm2\n\t" \ - "vaesenc %%xmm15, %%xmm11, %%xmm11\n\t" - -#define VAESENC_BLOCK_AVX2() \ - "vmovdqu "VAR(CTR1)", %%xmm5\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" \ - "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" \ - "vmovdqu %%xmm5, "VAR(CTR1)"\n\t" \ - "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu (%[in],"VAR(KR64)",1), %%xmm5\n\t" \ - "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu %%xmm4, (%[out],"VAR(KR64)",1)\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" - -/* Karatsuba multiplication - slower - * H01 = H[1] ^ H[0] (top and bottom 64-bits XORed) - */ -#define _VAESENC_GFMUL_AVX2(in, H, X, ctr1, H01) \ - "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - 
"vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm5\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm5\n\t" \ - "%=:\n\t" \ - "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" \ - "vmovdqu "#in", %%xmm0\n\t" \ - "vpxor %%xmm0, %%xmm4, %%xmm4\n\t" \ -\ - "vpsrldq $8, "#X", %%xmm2\n\t" \ - "vpxor "#X", %%xmm2, %%xmm2\n\t" \ - "vpclmulqdq $0x00, "#H", "#X", %%xmm5\n\t" \ - "vpclmulqdq $0x11, "#H", "#X", %%xmm8\n\t" \ - "vpclmulqdq $0x00, "#H01", %%xmm2, %%xmm7\n\t" \ - "vpxor %%xmm5, %%xmm7, %%xmm7\n\t" \ - "vpxor %%xmm8, %%xmm7, %%xmm7\n\t" \ - "vpslldq $8, %%xmm7, %%xmm6\n\t" \ - "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ - "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ -\ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, "VAR(XR)"\n\t" -#define VAESENC_GFMUL_AVX2(in, H, X, ctr1) \ - _VAESENC_GFMUL_AVX2(in, H, X, ctr1) - -#define _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) \ - "vpclmulqdq $0x10, "#H", "#X", %%xmm7\n\t" \ - "vpclmulqdq $0x01, "#H", "#X", %%xmm6\n\t" \ - "vpclmulqdq $0x00, "#H", "#X", %%xmm5\n\t" \ - "vpclmulqdq $0x11, "#H", "#X", %%xmm8\n\t" \ - "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ - "vpslldq $8, %%xmm7, %%xmm6\n\t" \ - "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ - "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm3\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm3, %%xmm4, %%xmm4\n\t" \ - "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm3\n\t" \ - "jl %=f\n\t" \ - "vaesenc %%xmm3, %%xmm4, %%xmm4\n\t" \ - "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" \ - "vmovdqa 224(%[KEY]), %%xmm3\n\t" \ - "%=:\n\t" \ - "vaesenclast %%xmm3, %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm5, %%xmm6, "VAR(XR)"\n\t" \ - "vmovdqu "#in", %%xmm5\n\t" \ - "vpxor %%xmm5, %%xmm4, %%xmm4\n\t" -#define VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) \ - _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1) - - -#define _GHASH_GFMUL_AVX2(r, r2, a, b) \ - "vpclmulqdq $0x10, "#a", "#b", %%xmm2\n\t" \ - "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ - "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \ - "vpslldq $8, %%xmm2, %%xmm1\n\t" \ - "vpsrldq $8, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm1, %%xmm0, "#r2"\n\t" \ - "vpxor %%xmm2, %%xmm3, "#r"\n\t" -#define GHASH_GFMUL_AVX2(r, r2, a, b) \ - _GHASH_GFMUL_AVX2(r, r2, a, b) - 
-#define GHASH_MID_AVX2(r, r2) \ - "vpsrld $31, "#r2", %%xmm0\n\t" \ - "vpsrld $31, "#r", %%xmm1\n\t" \ - "vpslld $1, "#r2", "#r2"\n\t" \ - "vpslld $1, "#r", "#r"\n\t" \ - "vpsrldq $12, %%xmm0, %%xmm2\n\t" \ - "vpslldq $4, %%xmm0, %%xmm0\n\t" \ - "vpslldq $4, %%xmm1, %%xmm1\n\t" \ - "vpor %%xmm2, "#r", "#r"\n\t" \ - "vpor %%xmm0, "#r2", "#r2"\n\t" \ - "vpor %%xmm1, "#r", "#r"\n\t" - -#define _GHASH_GFMUL_RED_AVX2(r, a, b) \ - "vpclmulqdq $0x10, "#a", "#b", %%xmm7\n\t" \ - "vpclmulqdq $0x01, "#a", "#b", %%xmm6\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm5\n\t" \ - "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ - "vpslldq $8, %%xmm7, %%xmm6\n\t" \ - "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm8\n\t" \ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, "#r"\n\t" -#define GHASH_GFMUL_RED_AVX2(r, a, b) \ - _GHASH_GFMUL_RED_AVX2(r, a, b) - -#define _GHASH_GFSQR_RED2_AVX2(r, a, mod128) \ - "vpclmulqdq $0x00, "#a", "#a", %%xmm6\n\t" \ - "vpclmulqdq $0x11, "#a", "#a", %%xmm8\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm6, %%xmm8, "#r"\n\t" -#define GHASH_GFSQR_RED2_AVX2(r, a, mod128) \ - _GHASH_GFSQR_RED2_AVX2(r, a, mod128) - -#define _GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) \ - "vpclmulqdq $0x10, "#a", "#b", %%xmm7\n\t" \ - "vpclmulqdq $0x01, "#a", "#b", %%xmm6\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm5\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm8\n\t" \ - "vpclmulqdq $0x00, "#a", "#a", %%xmm9\n\t" \ - "vpclmulqdq $0x11, "#a", "#a", %%xmm10\n\t" \ - "vpxor %%xmm6, %%xmm7, %%xmm7\n\t" \ - "vpslldq $8, %%xmm7, %%xmm6\n\t" \ - "vpsrldq $8, %%xmm7, %%xmm7\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm10, %%xmm4\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpshufd $0x4e, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm5, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm6, %%xmm5\n\t" \ - "vpclmulqdq $0x10, "#mod128", %%xmm10, %%xmm4\n\t" \ - "vpshufd $0x4e, %%xmm6, %%xmm6\n\t" \ - "vpshufd $0x4e, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm7, %%xmm8, %%xmm8\n\t" \ - "vpxor %%xmm4, %%xmm10, %%xmm10\n\t" \ - "vpxor %%xmm8, %%xmm6, %%xmm6\n\t" \ - "vpxor %%xmm10, %%xmm9, "#rs"\n\t" \ - "vpxor %%xmm5, %%xmm6, "#rm"\n\t" -#define GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) \ - _GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) - -#define CALC_HT_8_AVX2() \ - "vmovdqa %[MOD2_128], %%xmm11\n\t" \ - "vmovdqa "VAR(XR)", %%xmm2\n\t" \ - "# H ^ 1 and H ^ 2\n\t" \ - GHASH_GFSQR_RED2_AVX2(%%xmm0, HR, %%xmm11) \ - "vmovdqu "VAR(HR)", 0("VAR(HTR)")\n\t" \ - "vmovdqu %%xmm0 , 16("VAR(HTR)")\n\t" \ - "# H ^ 3 and H ^ 4\n\t" \ - GHASH_GFMUL_SQR_RED2_AVX2(%%xmm1, %%xmm3, HR, %%xmm0, %%xmm11) \ - "vmovdqu %%xmm1 , 32("VAR(HTR)")\n\t" \ - "vmovdqu %%xmm3 , 48("VAR(HTR)")\n\t" \ - "# H ^ 5 and H ^ 6\n\t" \ - GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm0, %%xmm1, %%xmm11) \ - "vmovdqu %%xmm12, 
64("VAR(HTR)")\n\t" \ - "vmovdqu %%xmm0 , 80("VAR(HTR)")\n\t" \ - "# H ^ 7 and H ^ 8\n\t" \ - GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm1, %%xmm3, %%xmm11) \ - "vmovdqu %%xmm12, 96("VAR(HTR)")\n\t" \ - "vmovdqu %%xmm0 , 112("VAR(HTR)")\n\t" - -#define _GHASH_RED_AVX2(r, r2) \ - "vmovdqa %[MOD2_128], %%xmm2\n\t" \ - "vpclmulqdq $0x10, %%xmm2, "#r2", %%xmm0\n\t" \ - "vpshufd $0x4e, "#r2", %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vpclmulqdq $0x10, %%xmm2, %%xmm1, %%xmm0\n\t" \ - "vpshufd $0x4e, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vpxor %%xmm1, "#r", "#r"\n\t" -#define GHASH_RED_AVX2(r, r2) \ - _GHASH_RED_AVX2(r, r2) - -#define GHASH_FULL_AVX2(r, r2, a, b) \ - GHASH_GFMUL_AVX2(r, r2, a, b) \ - GHASH_MID_AVX2(r, r2) \ - GHASH_RED_AVX2(r, r2) - -#define _GFMUL_3V_AVX2(r, r2, r3, a, b) \ - "vpclmulqdq $0x10, "#a", "#b", "#r3"\n\t" \ - "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", "#r2"\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", "#r"\n\t" \ - "vpxor %%xmm1, "#r3", "#r3"\n\t" -#define GFMUL_3V_AVX2(r, r2, r3, a, b) \ - _GFMUL_3V_AVX2(r, r2, r3, a, b) - -#define _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) \ - "vpclmulqdq $0x10, "#a", "#b", %%xmm2\n\t" \ - "vpclmulqdq $0x01, "#a", "#b", %%xmm1\n\t" \ - "vpclmulqdq $0x00, "#a", "#b", %%xmm0\n\t" \ - "vpclmulqdq $0x11, "#a", "#b", %%xmm3\n\t" \ - "vpxor %%xmm1, %%xmm2, %%xmm2\n\t" \ - "vpxor %%xmm3, "#r", "#r"\n\t" \ - "vpxor %%xmm2, "#r3", "#r3"\n\t" \ - "vpxor %%xmm0, "#r2", "#r2"\n\t" -#define GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) \ - _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b) - -#define GHASH_GFMUL_RED_8_AVX2() \ - "vmovdqu ("VAR(HTR)"), %%xmm12\n\t" \ - GFMUL_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm11, %%xmm12) \ - "vmovdqu 16("VAR(HTR)"), %%xmm12\n\t" \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm10, %%xmm12) \ - "vmovdqu 32("VAR(HTR)"), %%xmm11\n\t" \ - "vmovdqu 48("VAR(HTR)"), %%xmm12\n\t" \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm9, %%xmm11) \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm8, %%xmm12) \ - "vmovdqu 64("VAR(HTR)"), %%xmm11\n\t" \ - "vmovdqu 80("VAR(HTR)"), %%xmm12\n\t" \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm7, %%xmm11) \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm6, %%xmm12) \ - "vmovdqu 96("VAR(HTR)"), %%xmm11\n\t" \ - "vmovdqu 112("VAR(HTR)"), %%xmm12\n\t" \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm5, %%xmm11) \ - GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm4, %%xmm12) \ - "vpslldq $8, %%xmm14, %%xmm12\n\t" \ - "vpsrldq $8, %%xmm14, %%xmm14\n\t" \ - "vpxor %%xmm12, %%xmm13, %%xmm13\n\t" \ - "vpxor %%xmm14, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_RED_AVX2(XR, %%xmm13) - -#define CALC_IV_12_AVX2() \ - "# Calculate values when IV is 12 bytes\n\t" \ - "# Set counter based on IV\n\t" \ - "movl $0x01000000, %%ecx\n\t" \ - "vpinsrq $0, 0(%%rax), %%xmm13, %%xmm13\n\t" \ - "vpinsrd $2, 8(%%rax), %%xmm13, %%xmm13\n\t" \ - "vpinsrd $3, %%ecx, %%xmm13, %%xmm13\n\t" \ - "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \ - "vmovdqa 0(%[KEY]), "VAR(HR)"\n\t" \ - "vmovdqa 16(%[KEY]), %%xmm12\n\t" \ - "vpxor "VAR(HR)", %%xmm13, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 32(%[KEY]), %%xmm0\n\t" \ - "vmovdqa 48(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 64(%[KEY]), %%xmm0\n\t" \ - "vmovdqa 80(%[KEY]), %%xmm12\n\t" \ - "vaesenc 
%%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 96(%[KEY]), %%xmm0\n\t" \ - "vmovdqa 112(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqa 128(%[KEY]), %%xmm0\n\t" \ - "vmovdqa 144(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm0\n\t" \ - "jl 31f\n\t" \ - "vmovdqa 176(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm0\n\t" \ - "jl 31f\n\t" \ - "vmovdqa 208(%[KEY]), %%xmm12\n\t" \ - "vaesenc %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vaesenc %%xmm12, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenc %%xmm12, %%xmm1, %%xmm1\n\t" \ - "vmovdqu 224(%[KEY]), %%xmm0\n\t" \ - "31:\n\t" \ - "vaesenclast %%xmm0, "VAR(HR)", "VAR(HR)"\n\t" \ - "vaesenclast %%xmm0, %%xmm1, %%xmm1\n\t" \ - "vpshufb %[BSWAP_MASK], "VAR(HR)", "VAR(HR)"\n\t" \ - "vmovdqu %%xmm1, "VAR(TR)"\n\t" \ - -#define CALC_IV_AVX2() \ - "# Calculate values when IV is not 12 bytes\n\t" \ - "# H = Encrypt X(=0)\n\t" \ - "vmovdqa 0(%[KEY]), "VAR(HR)"\n\t" \ - VAESENC_AVX(HR) \ - "vpshufb %[BSWAP_MASK], "VAR(HR)", "VAR(HR)"\n\t" \ - "# Calc counter\n\t" \ - "# Initialization vector\n\t" \ - "cmpl $0, %%edx\n\t" \ - "movq $0, %%rcx\n\t" \ - "je 45f\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 44f\n\t" \ - "andl $0xfffffff0, %%edx\n\t" \ - "\n" \ - "43:\n\t" \ - "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 43b\n\t" \ - "movl %[ibytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 45f\n\t" \ - "\n" \ - "44:\n\t" \ - "subq $16, %%rsp\n\t" \ - "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "vmovdqu %%xmm4, (%%rsp)\n\t" \ - "42:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 42b\n\t" \ - "vmovdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ - "\n" \ - "45:\n\t" \ - "# T = Encrypt counter\n\t" \ - "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ - "shll $3, %%edx\n\t" \ - "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm0, %%xmm13, %%xmm13\n\t" \ - GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR) \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "# Encrypt counter\n\t" \ - "vmovdqa 0(%[KEY]), %%xmm4\n\t" \ - "vpxor %%xmm13, %%xmm4, %%xmm4\n\t" \ - VAESENC_AVX(%%xmm4) \ - "vmovdqu %%xmm4, "VAR(TR)"\n\t" - -#define CALC_AAD_AVX2() \ - "# Additional authentication data\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl $0, %%edx\n\t" \ - "je 25f\n\t" \ - "movq %[addt], %%rax\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "cmpl $16, %%edx\n\t" \ - "jl 24f\n\t" \ - "andl 
$0xfffffff0, %%edx\n\t" \ - "\n" \ - "23:\n\t" \ - "vmovdqu (%%rax,%%rcx,1), %%xmm4\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_FULL_AVX2(XR, %%xmm12, XR, HR) \ - "addl $16, %%ecx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 23b\n\t" \ - "movl %[abytes], %%edx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "je 25f\n\t" \ - "\n" \ - "24:\n\t" \ - "subq $16, %%rsp\n\t" \ - "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" \ - "xorl %%ebx, %%ebx\n\t" \ - "vmovdqu %%xmm4, (%%rsp)\n\t" \ - "22:\n\t" \ - "movzbl (%%rax,%%rcx,1), %%r13d\n\t" \ - "movb %%r13b, (%%rsp,%%rbx,1)\n\t" \ - "incl %%ecx\n\t" \ - "incl %%ebx\n\t" \ - "cmpl %%edx, %%ecx\n\t" \ - "jl 22b\n\t" \ - "vmovdqu (%%rsp), %%xmm4\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" \ - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_FULL_AVX2(XR, %%xmm12, XR, HR) \ - "\n" \ - "25:\n\t" - -#define VAESENC_128_GHASH_AVX2(src, o) \ - "leaq (%[in],"VAR(KR64)",1), %%rcx\n\t" \ - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" \ - /* src is either %%rcx or %%rdx */ \ - VAESENC_CTR() \ - VAESENC_XOR() \ - VAESENC_PCLMUL_AVX2_1(src, 16, (o-128), 112) \ - VAESENC_PCLMUL_AVX2_2(src, 32, (o-112), 96) \ - VAESENC_PCLMUL_AVX2_N(src, 48, (o- 96), 80) \ - VAESENC_PCLMUL_AVX2_N(src, 64, (o- 80), 64) \ - VAESENC_PCLMUL_AVX2_N(src, 80, (o- 64), 48) \ - VAESENC_PCLMUL_AVX2_N(src, 96, (o- 48), 32) \ - VAESENC_PCLMUL_AVX2_N(src, 112, (o- 32), 16) \ - VAESENC_PCLMUL_AVX2_N(src, 128, (o- 16), 0) \ - VAESENC_PCLMUL_AVX2_L(144) \ - "cmpl $11, %[nr]\n\t" \ - "vmovdqa 160(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - VAESENC() \ - VAESENC_SET(176) \ - "cmpl $13, %[nr]\n\t" \ - "vmovdqa 192(%[KEY]), %%xmm12\n\t" \ - "jl 4f\n\t" \ - VAESENC() \ - VAESENC_SET(208) \ - "vmovdqa 224(%[KEY]), %%xmm12\n\t" \ - "\n" \ -"4:\n\t" \ - VAESENC_LAST(%%rcx, %%rdx) - -#define AESENC_LAST15_ENC_AVX2() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "vmovdqu "VAR(CTR1)", %%xmm13\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ - "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ - VAESENC_AVX(%%xmm13) \ - "subq $16, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "vmovdqu %%xmm13, (%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "xorb (%%rsp,%%rcx,1), %%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "xorq %%r13, %%r13\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "je 53f\n\t" \ - "\n" \ - "52:\n\t" \ - "movb %%r13b, (%%rsp,%%rcx,1)\n\t" \ - "incl %%ecx\n\t" \ - "cmpl $16, %%ecx\n\t" \ - "jl 52b\n\t" \ - "53:\n\t" \ - "vmovdqu (%%rsp), %%xmm13\n\t" \ - "addq $16, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "vpxor %%xmm13, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ - -#define AESENC_LAST15_DEC_AVX2() \ - "movl %[nbytes], %%ecx\n\t" \ - "movl %%ecx, %%edx\n\t" \ - "andl $0x0f, %%ecx\n\t" \ - "jz 55f\n\t" \ - "vmovdqu "VAR(CTR1)", %%xmm13\n\t" \ - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" \ - "vpxor 0(%[KEY]), %%xmm13, %%xmm13\n\t" \ - VAESENC_AVX(%%xmm13) \ - "subq $32, %%rsp\n\t" \ - "xorl %%ecx, %%ecx\n\t" \ - "vmovdqu %%xmm13, (%%rsp)\n\t" \ - "vpxor %%xmm0, %%xmm0, %%xmm0\n\t" \ - "vmovdqu %%xmm0, 16(%%rsp)\n\t" \ - "\n" \ - "51:\n\t" \ - "movzbl (%[in],"VAR(KR64)",1), %%r13d\n\t" \ - "movb %%r13b, 16(%%rsp,%%rcx,1)\n\t" \ - "xorb (%%rsp,%%rcx,1), 
%%r13b\n\t" \ - "movb %%r13b, (%[out],"VAR(KR64)",1)\n\t" \ - "incl "VAR(KR)"\n\t" \ - "incl %%ecx\n\t" \ - "cmpl %%edx, "VAR(KR)"\n\t" \ - "jl 51b\n\t" \ - "53:\n\t" \ - "vmovdqu 16(%%rsp), %%xmm13\n\t" \ - "addq $32, %%rsp\n\t" \ - "vpshufb %[BSWAP_MASK], %%xmm13, %%xmm13\n\t" \ - "vpxor %%xmm13, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ - -#define CALC_TAG_AVX2() \ - "movl %[nbytes], %%edx\n\t" \ - "movl %[abytes], %%ecx\n\t" \ - "shlq $3, %%rdx\n\t" \ - "shlq $3, %%rcx\n\t" \ - "vpinsrq $0, %%rdx, %%xmm0, %%xmm0\n\t" \ - "vpinsrq $1, %%rcx, %%xmm0, %%xmm0\n\t" \ - "vpxor %%xmm0, "VAR(XR)", "VAR(XR)"\n\t" \ - GHASH_GFMUL_RED_AVX2(XR, HR, XR) \ - "vpshufb %[BSWAP_MASK], "VAR(XR)", "VAR(XR)"\n\t" \ - "vpxor "VAR(TR)", "VAR(XR)", %%xmm0\n\t" \ - - static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, @@ -6206,145 +5092,336 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, unsigned int abytes, unsigned int ibytes, const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" - "vpxor "VAR(XR)", "VAR(XR)", "VAR(XR)"\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12_AVX2() - "jmp 39f\n\t" - "\n" - "35:\n\t" - CALC_IV_AVX2() - "\n" - "39:\n\t" - - CALC_AAD_AVX2() - - "# Calculate counter and H\n\t" - "vpsrlq $63, "VAR(HR)", %%xmm5\n\t" - "vpsllq $1, "VAR(HR)", %%xmm4\n\t" - "vpslldq $8, %%xmm5, %%xmm5\n\t" - "vpor %%xmm5, %%xmm4, %%xmm4\n\t" - "vpshufd $0xff, "VAR(HR)", "VAR(HR)"\n\t" - "vpsrad $31, "VAR(HR)", "VAR(HR)"\n\t" - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" - "vpand %[MOD2_128], "VAR(HR)", "VAR(HR)"\n\t" - "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" - "vpxor %%xmm4, "VAR(HR)", "VAR(HR)"\n\t" - "vmovdqu %%xmm13, "VAR(CTR1)"\n\t" - - "xorl "VAR(KR)", "VAR(KR)"\n\t" - -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) - "cmpl $128, %[nbytes]\n\t" - "movl %[nbytes], %%r13d\n\t" - "jl 5f\n\t" - "andl $0xffffff80, %%r13d\n\t" - - CALC_HT_8_AVX2() - - "# First 128 bytes of input\n\t" - VAESENC_128() - - "cmpl $128, %%r13d\n\t" - "movl $128, "VAR(KR)"\n\t" - "jle 2f\n\t" - - "# More 128 bytes of input\n\t" - "\n" - "3:\n\t" - VAESENC_128_GHASH_AVX2(%%rdx, 0) - "addl $128, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 3b\n\t" - "\n" - "2:\n\t" - "vmovdqa %[BSWAP_MASK], %%xmm13\n\t" - "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" - "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" - "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" - "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t" - "vpshufb %%xmm13, %%xmm8, %%xmm8\n\t" - "vpshufb %%xmm13, %%xmm9, %%xmm9\n\t" - "vpshufb %%xmm13, %%xmm10, %%xmm10\n\t" - "vpshufb %%xmm13, %%xmm11, %%xmm11\n\t" - "vpxor %%xmm2, %%xmm4, %%xmm4\n\t" - - GHASH_GFMUL_RED_8_AVX2() - - "vmovdqu 0("VAR(HTR)"), "VAR(HR)"\n\t" - "\n" - "5:\n\t" - "movl %[nbytes], %%edx\n\t" - "cmpl %%edx, "VAR(KR)"\n\t" - "jge 55f\n\t" + int i, j ,k; + __m128i ctr1; + __m128i H, Y, T; + __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + __m128i HT[8]; + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + 
register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); +#else + __m128i tmp1, tmp2; #endif - "movl %[nbytes], %%r13d\n\t" - "andl $0xfffffff0, %%r13d\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 14f\n\t" + if (ibytes == 12) + aes_gcm_avx1_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - VAESENC_BLOCK_AVX2() - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 13f\n\t" - "vmovdqa %[MOD2_128], %%xmm0\n\t" - "\n" - "12:\n\t" - "vmovdqu (%[in],"VAR(KR64)",1), %%xmm9\n\t" - "vmovdqu "VAR(CTR1)", %%xmm5\n\t" - "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" - "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" - "vmovdqu %%xmm5, "VAR(CTR1)"\n\t" - VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, XR, CTR1) - "vmovdqu %%xmm4, (%[out],"VAR(KR64)",1)\n\t" - "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" - "addl $16, "VAR(KR)"\n\t" - "vpxor %%xmm4, "VAR(XR)", "VAR(XR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 12b\n\t" - "\n" - "13:\n\t" - GHASH_GFMUL_RED_AVX2(XR, HR, XR) - "\n" - "14:\n\t" + for (i=0; i < (int)(abytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw_avx2(X, H); + } + if (abytes%16) { + last_block = _mm_setzero_si128(); + for (j=0; j < (int)(abytes%16); j++) + ((unsigned char*)&last_block)[j] = addt[i*16+j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw_avx2(X, H); + } - AESENC_LAST15_ENC_AVX2() - "\n" - "55:\n\t" + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); - CALC_TAG_AVX2() - "vmovdqu %%xmm0, (%[tag])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" - "vzeroupper\n\t" +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted_avx2(H, H); + HT[2] = gfmul_shifted_avx2(H, HT[1]); + HT[3] = gfmul_shifted_avx2(HT[1], HT[1]); + HT[4] = gfmul_shifted_avx2(HT[1], HT[2]); + HT[5] = gfmul_shifted_avx2(HT[2], HT[2]); + HT[6] = gfmul_shifted_avx2(HT[2], HT[3]); + HT[7] = gfmul_shifted_avx2(HT[3], HT[3]); - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ibytes), - [tag] "r" (tag), + pctr1[0] = ctr1; + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_SET(16) + VAESENC_SET(32) + VAESENC_SET(48) + VAESENC_SET(64) + VAESENC_SET(80) + VAESENC_SET(96) + VAESENC_SET(112) + VAESENC_SET(128) + VAESENC_SET(144) + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl 1f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + "\n" + "1:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8) + : [KEY] "r" (KEY), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", 
"memory" + ); + + XV = X; + for (i=1; i < (int)(nbytes/16/8); i++) { + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_PCLMUL_AVX2_1(%[out], 16, -128, 112) + VAESENC_PCLMUL_AVX2_2(%[out], 32, -112, 96) + VAESENC_PCLMUL_AVX2_N(%[out], 48, -96, 80) + VAESENC_PCLMUL_AVX2_N(%[out], 64, -80, 64) + VAESENC_PCLMUL_AVX2_N(%[out], 80, -64, 48) + VAESENC_PCLMUL_AVX2_N(%[out], 96, -48, 32) + VAESENC_PCLMUL_AVX2_N(%[out], 112, -32, 16) + VAESENC_PCLMUL_AVX2_N(%[out], 128, -16, 0) + VAESENC_PCLMUL_AVX2_L(144) + + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + + "%=:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8), + [XV] "+xr" (XV) + : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + } + X = XV; + ctr1 = pctr1[0]; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8_avx2(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + "# Carryless Multiply X by H (128 x 128)\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "# Reduce\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "# End Reduce\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), [BSWAP_MASK] "m" (BSWAP_MASK), [BSWAP_EPI64] "m" (BSWAP_EPI64), [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), -#endif [MOD2_128] "m" (MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", 
"xmm9", "xmm10", "xmm11", - "rbx", "rcx", "rdx", "r13" - ); + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } +#else + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + __asm__ __volatile__ ( + VAESENC_BLOCK() + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "memory" + ); + } + for (; k < (int)(nbytes/16); k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %[tmp1]\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %[tmp1], %[tmp1]\n\t" + "vaesenc 16(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" + "vaesenc 32(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" + "vaesenc 48(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" + "vaesenc 64(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" + "vaesenc 80(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc 96(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vaesenc 112(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vaesenc 128(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vaesenc 144(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 176(%[KEY]), %[tmp1], %[tmp1]\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %[tmp2]\n\t" + "jl %=f\n\t" + "vaesenc %[tmp2], %[tmp1], %[tmp1]\n\t" + "vaesenc 208(%[KEY]), %[tmp1], %[tmp1]\n\t" + "vmovaps 224(%[KEY]), %[tmp2]\n\t" + "%=:\n\t" + "vaesenclast %[tmp2], %[tmp1], %[tmp1]\n\t" + "vpxor (%[in]), %[tmp1], %[tmp1]\n\t" + "vmovdqu %[tmp1], (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %[tmp1], %[tmp1]\n\t" + "vpxor %[tmp1], %[X], %[X]\n\t" + + : [tmp1] "+xr" (tmp1), [tmp2] "=xr" (tmp2), + [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + if (k > 0) { + X = gfmul_shifted_avx2(X, H); + } +#endif + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = 
KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); } #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ @@ -6352,374 +5429,810 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, #ifdef HAVE_AES_DECRYPT /* Figure 10. AES-GCM – Decrypt With Single Block Ghash at a Time */ -static void AES_GCM_decrypt(const unsigned char *in, unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, const unsigned char *tag, - int nbytes, int abytes, int ibytes, - const unsigned char* key, int nr, int* res) +static int AES_GCM_decrypt(const unsigned char *in, unsigned char *out, + const unsigned char* addt, const unsigned char* ivec, + const unsigned char *tag, int nbytes, int abytes, + int ibytes, const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - register int ivLen asm("ebx") = ibytes; - - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm15, %%xmm15\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12() - "\n" - "35:\n\t" - CALC_IV() - "\n" - "39:\n\t" - - CALC_AAD() - - "# Calculate counter and H\n\t" - "pshufb %[BSWAP_EPI64], %%xmm13\n\t" - "movdqa "VAR(HR)", %%xmm5\n\t" - "paddd %[ONE], %%xmm13\n\t" - "movdqa "VAR(HR)", %%xmm4\n\t" - "movdqu %%xmm13, "VAR(CTR1)"\n\t" - "psrlq $63, %%xmm5\n\t" - "psllq $1, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "por %%xmm5, %%xmm4\n\t" - "pshufd $0xff, "VAR(HR)", "VAR(HR)"\n\t" - "psrad $31, "VAR(HR)"\n\t" - "pand %[MOD2_128], "VAR(HR)"\n\t" - "pxor %%xmm4, "VAR(HR)"\n\t" - - "xorl "VAR(KR)", "VAR(KR)"\n\t" - -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - "cmpl $128, %[nbytes]\n\t" - "jl 5f\n\t" - - CALC_HT_8_AVX() - - "movl %[nbytes], %%r13d\n\t" - "andl $0xffffff80, %%r13d\n\t" - "\n" - "2:\n\t" - AESENC_128_GHASH_AVX(%%rcx, 128) - "addl $128, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 2b\n\t" - - "movdqa %%xmm2, "VAR(XR)"\n\t" - "movdqu (%%rsp), "VAR(HR)"\n\t" - "5:\n\t" - "movl %[nbytes], %%edx\n\t" - "cmpl %%edx, "VAR(KR)"\n\t" - "jge 55f\n\t" + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); + __m128i tmp1, tmp2, XV; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; #endif - "movl %[nbytes], %%r13d\n\t" - "andl $0xfffffff0, %%r13d\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 13f\n\t" - "\n" - "12:\n\t" - "leaq (%[in],"VAR(KR64)",1), %%rcx\n\t" - "leaq (%[out],"VAR(KR64)",1), %%rdx\n\t" - "movdqu (%%rcx), %%xmm1\n\t" - "movdqa "VAR(HR)", %%xmm0\n\t" - "pshufb 
%[BSWAP_MASK], %%xmm1\n\t" - "pxor "VAR(XR)", %%xmm1\n\t" - AESENC_GFMUL(%%rcx, %%rdx, %%xmm0, %%xmm1) - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 12b\n\t" - "\n" - "13:\n\t" + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - AESENC_LAST15_DEC_AVX() - "\n" - "55:\n\t" + for (i=0; i return 0\n\t" - "xorl %%eax, %%eax\n\t" - "cmpl $0xffff, %%edx\n\t" - "sete %%al\n\t" - "movl %%eax, (%[res])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); + i = 0; - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ivLen), - [tag] "r" (tag), [res] "r" (res), - [BSWAP_MASK] "m" (BSWAP_MASK), - [BSWAP_EPI64] "m" (BSWAP_EPI64), - [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), +#ifndef AES_GCM_AESNI_NO_UNROLL + + if (0 < nbytes/16/8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); + + for (; i < nbytes/16/8; i++) { + r0 = _mm_setzero_si128(); + r1 = _mm_setzero_si128(); + + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+0]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + gfmul_only(XV, HT[7], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+1]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[6], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); + tmp8 = _mm_aesenc_si128(tmp8, 
KEY[2]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+2]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[5], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+3]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[4], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+4]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[3], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+5]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[2], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+6]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[1], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[i*8+7]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[0], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); + /* Reduction */ + X = ghash_red(r0, r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, 
KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + } + } #endif - [MOD2_128] "m" (MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "rcx", "rdx", "r13" - ); + for (k = i*8; k < nbytes/16; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[k]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp2 = _mm_loadu_si128(&((__m128i*)in)[k]); + tmp1 = _mm_xor_si128(tmp1, tmp2); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + } + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = 
in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + return 0; /* in case the authentication failed */ + + return 1; /* when successful returns 1 */ } #ifdef HAVE_INTEL_AVX1 -static void AES_GCM_decrypt_avx1(const unsigned char *in, unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, - const unsigned char *tag, int nbytes, - int abytes, int ibytes, - const unsigned char* key, int nr, int* res) +static int AES_GCM_decrypt_avx1(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + const unsigned char *tag, int nbytes, + int abytes, int ibytes, + const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - register int ivLen asm("ebx") = ibytes; + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + __m128i HT[8]; + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); +#else + __m128i XV; + __m128i tmp1; +#endif - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" - "vpxor %%xmm15, %%xmm15, %%xmm15\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12_AVX1() - "\n" - "35:\n\t" - CALC_IV_AVX1() - "\n" - "39:\n\t" + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - CALC_AAD_AVX1() + for (i=0; i return 0\n\t" - "xorl %%eax, %%eax\n\t" - "cmpl $0xffff, %%edx\n\t" - "sete %%al\n\t" - "movl %%eax, (%[res])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" - "vzeroupper\n\t" - - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ivLen), - [tag] "r" (tag), [res] "r" (res), + : [H] "+xr" (H), [X] "+xr" (X), + [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), [BSWAP_MASK] "m" (BSWAP_MASK), [BSWAP_EPI64] "m" (BSWAP_EPI64), [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), -#endif [MOD2_128] "m" (MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "rcx", "rdx", "r13" - ); + : "xmm15", "xmm14", "xmm13", "xmm4", 
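Each of these decrypt paths finishes by comparing the computed tag T against the caller's tag with _mm_cmpeq_epi8 and _mm_movemask_epi8: every matching byte contributes one set bit to the mask, so 0xffff means all 16 bytes agree and the function returns 1. A plain-C equivalent of that 16-byte check, shown only for reference (the helper name is not from this file):

    /* Return 1 when two 16-byte authentication tags match, 0 otherwise.
     * Accumulating XOR differences avoids an early exit on mismatch. */
    static int tag16_equal(const unsigned char* a, const unsigned char* b)
    {
        unsigned char diff = 0;
        int i;

        for (i = 0; i < 16; i++)
            diff |= (unsigned char)(a[i] ^ b[i]);

        return diff == 0;
    }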
"xmm5", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + return 0; /* in case the authentication failed */ + + return 1; /* when successful returns 1 */ } #ifdef HAVE_INTEL_AVX2 -static void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, - const unsigned char *tag, int nbytes, - int abytes, int ibytes, - const unsigned char* key, int nr, int* res) +static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + const unsigned char *tag, int nbytes, + int abytes, int ibytes, + const unsigned char* key, int nr) { - register const unsigned char* iv asm("rax") = ivec; - register int ivLen asm("ebx") = ibytes; - - __asm__ __volatile__ ( - "subq $"VAR(STACK_OFFSET)", %%rsp\n\t" - /* Counter is xmm13 */ - "vpxor %%xmm13, %%xmm13, %%xmm13\n\t" - "vpxor %%xmm15, %%xmm15, %%xmm15\n\t" - "movl %[ibytes], %%edx\n\t" - "cmpl $12, %%edx\n\t" - "jne 35f\n\t" - CALC_IV_12_AVX2() - "jmp 39f\n\t" - "\n" - "35:\n\t" - CALC_IV_AVX2() - "\n" - "39:\n\t" - - CALC_AAD_AVX2() - - "# Calculate counter and H\n\t" - "vpsrlq $63, "VAR(HR)", %%xmm5\n\t" - "vpsllq $1, "VAR(HR)", %%xmm4\n\t" - "vpslldq $8, %%xmm5, %%xmm5\n\t" - "vpor %%xmm5, %%xmm4, %%xmm4\n\t" - "vpshufd $0xff, "VAR(HR)", "VAR(HR)"\n\t" - "vpsrad $31, "VAR(HR)", "VAR(HR)"\n\t" - "vpshufb %[BSWAP_EPI64], %%xmm13, %%xmm13\n\t" - "vpand %[MOD2_128], "VAR(HR)", "VAR(HR)"\n\t" - "vpaddd %[ONE], %%xmm13, %%xmm13\n\t" - "vpxor %%xmm4, "VAR(HR)", "VAR(HR)"\n\t" - "vmovdqu %%xmm13, "VAR(CTR1)"\n\t" - - "xorl "VAR(KR)", "VAR(KR)"\n\t" - -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) - "cmpl $128, %[nbytes]\n\t" - "jl 5f\n\t" - - CALC_HT_8_AVX2() - - "movl %[nbytes], %%r13d\n\t" - "andl $0xffffff80, %%r13d\n\t" - "\n" - "2:\n\t" - VAESENC_128_GHASH_AVX2(%%rcx, 128) - "addl $128, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 2b\n\t" - - "vmovdqa 
%%xmm2, "VAR(XR)"\n\t" - "vmovdqu (%%rsp), "VAR(HR)"\n\t" - "5:\n\t" - "movl %[nbytes], %%edx\n\t" - "cmpl %%edx, "VAR(KR)"\n\t" - "jge 55f\n\t" + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + __m128i HT[8]; + __m128i pctr1[1]; + register __m128i XV asm("xmm2"); + register __m128i tmp1 asm("xmm4"); + register __m128i tmp2 asm("xmm5"); + register __m128i tmp3 asm("xmm6"); + register __m128i tmp4 asm("xmm7"); + register __m128i tmp5 asm("xmm8"); + register __m128i tmp6 asm("xmm9"); + register __m128i tmp7 asm("xmm10"); + register __m128i tmp8 asm("xmm11"); +#else + __m128i XV; + __m128i tmp1; #endif - "movl %[nbytes], %%r13d\n\t" - "andl $0xfffffff0, %%r13d\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jge 13f\n\t" - "vmovdqa %[MOD2_128], %%xmm0\n\t" - "\n" - "12:\n\t" - "vmovdqu (%[in],"VAR(KR64)",1), %%xmm9\n\t" - "vmovdqu "VAR(CTR1)", %%xmm5\n\t" - "vpshufb %[BSWAP_MASK], %%xmm9, %%xmm1\n\t" - "vpshufb %[BSWAP_EPI64], %%xmm5, %%xmm4\n\t" - "vpaddd %[ONE], %%xmm5, %%xmm5\n\t" - "vpxor "VAR(XR)", %%xmm1, %%xmm1\n\t" - "vmovdqu %%xmm5, "VAR(CTR1)"\n\t" - VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, %%xmm1, CTR1) - "vmovdqu %%xmm4, (%[out],"VAR(KR64)",1)\n\t" - "addl $16, "VAR(KR)"\n\t" - "cmpl %%r13d, "VAR(KR)"\n\t" - "jl 12b\n\t" - "\n" - "13:\n\t" + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T, X); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T, X); - AESENC_LAST15_DEC_AVX2() - "\n" - "55:\n\t" + for (i=0; i return 0\n\t" - "xorl %%eax, %%eax\n\t" - "cmpl $0xffff, %%edx\n\t" - "sete %%al\n\t" - "movl %%eax, (%[res])\n\t" - "addq $"VAR(STACK_OFFSET)", %%rsp\n\t" - "vzeroupper\n\t" + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); + i = 0; - : - : [KEY] "r" (key), - [in] "r" (in), [out] "r" (out), [nr] "r" (nr), - [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt), - [ivec] "r" (iv), [ibytes] "r" (ivLen), - [tag] "r" (tag), [res] "r" (res), +#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL) + if (0 < nbytes/16/8) { + HT[0] = H; + HT[1] = gfmul_shifted_avx2(H, H); + HT[2] = gfmul_shifted_avx2(H, HT[1]); + HT[3] = gfmul_shifted_avx2(HT[1], HT[1]); + HT[4] = gfmul_shifted_avx2(HT[1], HT[2]); + HT[5] = gfmul_shifted_avx2(HT[2], HT[2]); + HT[6] = gfmul_shifted_avx2(HT[2], HT[3]); + HT[7] = gfmul_shifted_avx2(HT[3], HT[3]); + + pctr1[0] = ctr1; + XV = X; + for (; i < nbytes/16/8; i++) { + __asm__ __volatile__ ( + VAESENC_CTR() + VAESENC_XOR() + VAESENC_PCLMUL_AVX2_1(%[in], 16, 0, 112) + VAESENC_PCLMUL_AVX2_2(%[in], 32, 16, 96) + VAESENC_PCLMUL_AVX2_N(%[in], 48, 32, 80) + VAESENC_PCLMUL_AVX2_N(%[in], 64, 48, 64) + VAESENC_PCLMUL_AVX2_N(%[in], 80, 64, 48) + VAESENC_PCLMUL_AVX2_N(%[in], 96, 80, 32) + VAESENC_PCLMUL_AVX2_N(%[in], 112, 96, 16) + VAESENC_PCLMUL_AVX2_N(%[in], 128, 112, 0) + VAESENC_PCLMUL_AVX2_L(144) + + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(176) + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm12\n\t" + "jl %=f\n\t" + + VAESENC() + VAESENC_SET(208) + "vmovaps 224(%[KEY]), %%xmm12\n\t" + + "%=:\n\t" + VAESENC_LAST() + + : [tmp1] "=xr" (tmp1), [tmp2] "=xr" (tmp2), [tmp3] "=xr" (tmp3), + [tmp4] "=xr" (tmp4), [tmp5] "=xr" (tmp5), [tmp6] "=xr" (tmp6), + [tmp7] "=xr" (tmp7), [tmp8] "=xr" (tmp8), + [XV] "+xr" (XV) + : [KEY] "r" (KEY), [HT] 
"r" (HT), [pctr1] "r" (pctr1), + [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), + [BSWAP_MASK] "m" (BSWAP_MASK), + [BSWAP_EPI64] "m" (BSWAP_EPI64), + [ONE] "m" (ONE), [TWO] "m" (TWO), + [THREE] "m" (THREE), [FOUR] "m" (FOUR), + [FIVE] "m" (FIVE), [SIX] "m" (SIX), + [SEVEN] "m" (SEVEN), [EIGHT] "m" (EIGHT), + [MOD2_128] "m" (MOD2_128) + : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm0", "xmm1", "xmm3", "memory" + ); + } + X = XV; + ctr1 = pctr1[0]; + } +#endif + for (k = i*8; k < nbytes/16; k++) { + __asm__ __volatile__ ( + "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t" + "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" + "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps %[H], %%xmm0\n\t" + "vmovdqu (%[in]), %%xmm1\n\t" + "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" + "vpxor %[X], %%xmm1, %%xmm1\n\t" + "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" + "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" + "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" + "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" + "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpslldq $8, %%xmm13, %%xmm2\n\t" + "vpsrldq $8, %%xmm13, %%xmm13\n\t" + "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" + "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" + "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" + "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" + "# Reduce\n\t" + "vmovdqa %[MOD2_128], %%xmm0\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" + "vpshufd $78, %%xmm2, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" + "vpshufd $78, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" + "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" + "vmovdqa %%xmm13, %[X]\n\t" + "# End Reduce\n\t" + "cmpl $11, %[nr]\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" + "jl %=f\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" + "cmpl $13, %[nr]\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" + "jl %=f\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" + "%=:\n\t" + "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + + : [H] "+xr" (H), [X] "+xr" (X), + [ctr1] "+xr" (ctr1) + : [KEY] "r" (KEY), + [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), [BSWAP_MASK] "m" (BSWAP_MASK), [BSWAP_EPI64] "m" (BSWAP_EPI64), [ONE] "m" (ONE), -#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL) - [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR), - [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN), - [EIGHT] "m" (EIGHT), -#endif [MOD2_128] "m" (MOD2_128) - : "xmm15", "xmm14", "xmm13", "xmm12", - "xmm0", "xmm1", "xmm2", "xmm3", "memory", - "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", - "rcx", "rdx", "r13" - ); + : "xmm15", "xmm14", "xmm13", "xmm4", "xmm5", + "xmm0", "xmm1", "xmm2", "xmm3", "memory" + ); + } + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + 
tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted_avx2(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted_avx2(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + return 0; /* in case the authentication failed */ + + return 1; /* when successful returns 1 */ } #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ @@ -7234,14 +6747,36 @@ void GHASH(Aes* aes, const byte* a, word32 aSz, const byte* c, #if !defined(WOLFSSL_XILINX_CRYPT) -#ifdef FREESCALE_LTC_AES_GCM int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { - status_t status; + int ret = 0; word32 keySize; +#ifdef FREESCALE_LTC_AES_GCM + status_t status; +#else + word32 blocks = sz / AES_BLOCK_SIZE; + word32 partial = sz % AES_BLOCK_SIZE; + const byte* p = in; + byte* c = out; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte *ctr; + byte scratch[AES_BLOCK_SIZE]; +#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) + #ifdef WOLFSSL_STM32_CUBEMX + CRYP_HandleTypeDef hcryp; + #else + byte keyCopy[AES_BLOCK_SIZE * 2]; + #endif /* WOLFSSL_STM32_CUBEMX */ + int status = 0; + byte* authInPadded = NULL; + byte tag[AES_BLOCK_SIZE]; + int authPadSz; +#endif /* STM32_CRYPTO */ +#endif /* FREESCALE_LTC_AES_GCM */ /* argument checks */ if (aes == NULL || authTagSz > AES_BLOCK_SIZE) { @@ -7257,133 +6792,158 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, if (ret != 0) return ret; +#ifdef FREESCALE_LTC_AES_GCM + status = LTC_AES_EncryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); - return (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E; -} -#else -#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ - defined(WOLFSSL_STM32F7)) -static INLINE int wc_AesGcmEncrypt_STM32(Aes* aes, byte* out, const byte* in, - word32 sz, const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - word32 keySize; - byte initialCounter[AES_BLOCK_SIZE]; - #ifdef WOLFSSL_STM32_CUBEMX - CRYP_HandleTypeDef hcryp; - #else - byte keyCopy[AES_BLOCK_SIZE * 2]; - #endif /* WOLFSSL_STM32_CUBEMX */ - int status = 0; - byte* authInPadded = NULL; - byte tag[AES_BLOCK_SIZE]; - int authPadSz; + ret = (status == kStatus_Success) ? 
0 : AES_GCM_AUTH_E; - ret = wc_AesGetKeySize(aes, &keySize); - if (ret != 0) - return ret; +#else + +#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) + + /* additional argument checks - STM32 HW only supports 12 byte IV */ + if (ivSz != NONCE_SZ) { + return BAD_FUNC_ARG; + } XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); XMEMCPY(initialCounter, iv, ivSz); initialCounter[AES_BLOCK_SIZE - 1] = STM32_GCM_IV_START; - /* pad authIn if it is not a block multiple */ - if ((authInSz % AES_BLOCK_SIZE) != 0) { - authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; - /* Need to pad the AAD to a full block with zeros. */ - authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); - if (authInPadded == NULL) { - return MEMORY_E; + /* STM32 HW AES-GCM requires / assumes inputs are a multiple of block size. + * We can avoid this by zero padding (authIn) AAD, but zero-padded plaintext + * will be encrypted and output incorrectly, causing a bad authTag. + * We will use HW accelerated AES-GCM if plain%AES_BLOCK_SZ==0. + * Otherwise, we will use accelerated AES_CTR for encrypt, and then + * perform GHASH in software. + * See NIST SP 800-38D */ + + /* Plain text is a multiple of block size, so use HW-Accelerated AES_GCM */ + if (!partial) { + /* pad authIn if it is not a block multiple */ + if ((authInSz % AES_BLOCK_SIZE) != 0) { + authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE; + /* Need to pad the AAD to a full block with zeros. */ + authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); + if (authInPadded == NULL) { + return MEMORY_E; + } + XMEMSET(authInPadded, 0, authPadSz); + XMEMCPY(authInPadded, authIn, authInSz); + } else { + authPadSz = authInSz; + authInPadded = (byte*)authIn; } - XMEMSET(authInPadded, 0, authPadSz); - XMEMCPY(authInPadded, authIn, authInSz); - } else { - authPadSz = authInSz; - authInPadded = (byte*)authIn; + + + #ifdef WOLFSSL_STM32_CUBEMX + XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); + switch (keySize) { + case 16: /* 128-bit key */ + hcryp.Init.KeySize = CRYP_KEYSIZE_128B; + break; + case 24: /* 192-bit key */ + hcryp.Init.KeySize = CRYP_KEYSIZE_192B; + break; + case 32: /* 256-bit key */ + hcryp.Init.KeySize = CRYP_KEYSIZE_256B; + break; + default: + break; + } + hcryp.Instance = CRYP; + hcryp.Init.DataType = CRYP_DATATYPE_8B; + hcryp.Init.pKey = (byte*)aes->key; + hcryp.Init.pInitVect = initialCounter; + hcryp.Init.Header = authInPadded; + hcryp.Init.HeaderSize = authInSz; + + HAL_CRYP_Init(&hcryp); + status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, (byte*)in, sz, + out, STM32_HAL_TIMEOUT); + /* Compute the authTag */ + if (status == HAL_OK) + status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT); + + if (status != HAL_OK) + ret = AES_GCM_AUTH_E; + HAL_CRYP_DeInit(&hcryp); + #else + ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize); + status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)initialCounter, + (uint8_t*)keyCopy, keySize * 8, + (uint8_t*)in, sz, + (uint8_t*)authInPadded,authInSz, + (uint8_t*)out, tag); + if (status != SUCCESS) + ret = AES_GCM_AUTH_E; + #endif /* WOLFSSL_STM32_CUBEMX */ + + /* authTag may be shorter than AES_BLOCK_SZ, store separately */ + if (ret == 0) + XMEMCPY(authTag, tag, authTagSz); + + /* We only allocate extra memory if authInPadded is not a multiple of AES_BLOCK_SZ */ + if (authInPadded != NULL && authInSz != authPadSz) { + XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); + } + + return ret; } +#endif -#ifdef 
WOLFSSL_STM32_CUBEMX - XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef)); - switch (keySize) { - case 16: /* 128-bit key */ - hcryp.Init.KeySize = CRYP_KEYSIZE_128B; - break; - case 24: /* 192-bit key */ - hcryp.Init.KeySize = CRYP_KEYSIZE_192B; - break; - case 32: /* 256-bit key */ - hcryp.Init.KeySize = CRYP_KEYSIZE_256B; - break; - default: - break; + /* Software AES-GCM */ + +#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) + /* if async and byte count above threshold */ + if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && + sz >= WC_ASYNC_THRESH_AES_GCM) { + #if defined(HAVE_CAVIUM) + /* Not yet supported, contact wolfSSL if interested in using */ + #elif defined(HAVE_INTEL_QA) + return IntelQaSymAesGcmEncrypt(&aes->asyncDev, out, in, sz, + (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + #else /* WOLFSSL_ASYNC_CRYPT_TEST */ + if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_ENCRYPT)) { + WC_ASYNC_TEST* testDev = &aes->asyncDev.test; + testDev->aes.aes = aes; + testDev->aes.out = out; + testDev->aes.in = in; + testDev->aes.sz = sz; + testDev->aes.iv = iv; + testDev->aes.ivSz = ivSz; + testDev->aes.authTag = authTag; + testDev->aes.authTagSz = authTagSz; + testDev->aes.authIn = authIn; + testDev->aes.authInSz = authInSz; + return WC_PENDING_E; + } + #endif } - hcryp.Instance = CRYP; - hcryp.Init.DataType = CRYP_DATATYPE_8B; - hcryp.Init.pKey = (byte*)aes->key; - hcryp.Init.pInitVect = initialCounter; - hcryp.Init.Header = authInPadded; - hcryp.Init.HeaderSize = authInSz; - - HAL_CRYP_Init(&hcryp); - status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, (byte*)in, sz, - out, STM32_HAL_TIMEOUT); - /* Compute the authTag */ - if (status == HAL_OK) - status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT); - - if (status != HAL_OK) - ret = AES_GCM_AUTH_E; - HAL_CRYP_DeInit(&hcryp); -#else - ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize); - status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)initialCounter, - (uint8_t*)keyCopy, keySize * 8, - (uint8_t*)in, sz, - (uint8_t*)authInPadded,authInSz, - (uint8_t*)out, tag); - if (status != SUCCESS) - ret = AES_GCM_AUTH_E; -#endif /* WOLFSSL_STM32_CUBEMX */ - - /* authTag may be shorter than AES_BLOCK_SZ, store separately */ - if (ret == 0) - XMEMCPY(authTag, tag, authTagSz); - - /* We only allocate extra memory if authInPadded is not a multiple of AES_BLOCK_SZ */ - if (authInPadded != NULL && authInSz != authPadSz) { - XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); - } - - return ret; -} -#endif /* STM32_CRYPTO */ +#endif /* WOLFSSL_ASYNC_CRYPT */ #ifdef WOLFSSL_AESNI -int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz); -#else -static + if (haveAESNI) { + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX2(intel_flags)) { + AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + } + else if (IS_INTEL_AVX1(intel_flags)) { + AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + } + else + #endif + AES_GCM_encrypt(in, out, authIn, iv, authTag, + sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); + return 0; + } #endif -int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - int ret = 0; - word32 blocks = sz 
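/*
 * The STM32 hardware branch above only consumes the AAD in whole 16-byte
 * blocks, so when authInSz is not a block multiple it works on a zero-padded
 * copy and releases that copy afterwards (note the "authInSz != authPadSz"
 * guard before XFREE).  A minimal sketch of the same padding idea using plain
 * malloc/free; pad_aad is an illustrative name, not a wolfSSL API.
 */
#include <stdlib.h>
#include <string.h>

#define GCM_BLOCK_SZ 16

/* Returns the AAD rounded up to a multiple of GCM_BLOCK_SZ bytes.  When no
 * padding is needed the original pointer comes back unchanged; otherwise a
 * fresh zero-padded copy is returned and the caller must free it. */
static const unsigned char* pad_aad(const unsigned char* aad, size_t aadSz,
                                    size_t* paddedSz)
{
    unsigned char* copy;

    if ((aadSz % GCM_BLOCK_SZ) == 0) {
        *paddedSz = aadSz;
        return aad;
    }

    *paddedSz = ((aadSz / GCM_BLOCK_SZ) + 1) * GCM_BLOCK_SZ;
    copy = (unsigned char*)malloc(*paddedSz);
    if (copy == NULL)
        return NULL;

    memset(copy, 0, *paddedSz);
    memcpy(copy, aad, aadSz);
    return copy;
}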
/ AES_BLOCK_SIZE; - word32 partial = sz % AES_BLOCK_SIZE; - const byte* p = in; - byte* c = out; - byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr; - byte scratch[AES_BLOCK_SIZE]; ctr = counter; XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); @@ -7429,7 +6989,6 @@ int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, } else #endif /* HAVE_AES_ECB */ - while (blocks--) { IncrementGcmCounter(ctr); #ifndef WOLFSSL_PIC32MZ_CRYPT @@ -7446,151 +7005,30 @@ int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, wc_AesEncrypt(aes, ctr, scratch); xorbuf(scratch, p, partial); XMEMCPY(c, scratch, partial); + } GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); wc_AesEncrypt(aes, initialCounter, scratch); xorbuf(authTag, scratch, authTagSz); +#endif /* FREESCALE_LTC_AES_GCM */ + return ret; } -int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - /* argument checks */ - if (aes == NULL || authTagSz > AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) { - WOLFSSL_MSG("GcmEncrypt authTagSz too small error"); - return BAD_FUNC_ARG; - } - -#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \ - defined(WOLFSSL_STM32F7)) - - /* additional argument checks - STM32 HW only supports 12 byte IV */ - if (ivSz != NONCE_SZ) { - return BAD_FUNC_ARG; - } - - /* STM32 HW AES-GCM requires / assumes inputs are a multiple of block size. - * We can avoid this by zero padding (authIn) AAD, but zero-padded plaintext - * will be encrypted and output incorrectly, causing a bad authTag. - * We will use HW accelerated AES-GCM if plain%AES_BLOCK_SZ==0. - * Otherwise, we will use accelerated AES_CTR for encrypt, and then - * perform GHASH in software. 
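/*
 * The software loop above is plain CTR mode plus GHASH: the last 32 bits of
 * the counter block are bumped as a big-endian integer before every block,
 * and a short final block is handled by encrypting one more counter block and
 * XOR-ing only the bytes that exist.  A minimal sketch of those two steps;
 * gcm_inc32 and ctr_xor_partial are illustrative names, and the one-block
 * cipher is abstracted behind a callback rather than wc_AesEncrypt.
 */
#include <stddef.h>

typedef void (*block_enc_fn)(const unsigned char in[16], unsigned char out[16],
                             void* key);

/* Big-endian increment of the low 32 bits of the counter block; the 96-bit
 * IV portion in bytes 0..11 is never modified. */
static void gcm_inc32(unsigned char ctr[16])
{
    int i;
    for (i = 15; i >= 12; i--) {
        if (++ctr[i] != 0)
            break;              /* stop once a byte does not wrap to zero */
    }
}

/* Encrypt a trailing partial block (1..15 bytes): generate one keystream
 * block from the incremented counter and XOR only the bytes present. */
static void ctr_xor_partial(block_enc_fn enc, void* key, unsigned char ctr[16],
                            const unsigned char* in, unsigned char* out,
                            size_t partial)
{
    unsigned char ks[16];
    size_t i;

    gcm_inc32(ctr);
    enc(ctr, ks, key);
    for (i = 0; i < partial; i++)
        out[i] = in[i] ^ ks[i];
}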
- * See NIST SP 800-38D */ - - /* Plain text is a multiple of block size, so use HW-Accelerated AES_GCM */ - if (sz % AES_BLOCK_SIZE == 0) { - return wc_AesGcmEncrypt_STM32(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); - } -#endif - -#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) - /* if async and byte count above threshold */ - if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && - sz >= WC_ASYNC_THRESH_AES_GCM) { - #if defined(HAVE_CAVIUM) - /* Not yet supported, contact wolfSSL if interested in using */ - #elif defined(HAVE_INTEL_QA) - return IntelQaSymAesGcmEncrypt(&aes->asyncDev, out, in, sz, - (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, - authTag, authTagSz, authIn, authInSz); - #else /* WOLFSSL_ASYNC_CRYPT_TEST */ - if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_ENCRYPT)) { - WC_ASYNC_TEST* testDev = &aes->asyncDev.test; - testDev->aes.aes = aes; - testDev->aes.out = out; - testDev->aes.in = in; - testDev->aes.sz = sz; - testDev->aes.iv = iv; - testDev->aes.ivSz = ivSz; - testDev->aes.authTag = authTag; - testDev->aes.authTagSz = authTagSz; - testDev->aes.authIn = authIn; - testDev->aes.authInSz = authInSz; - return WC_PENDING_E; - } - #endif - } -#endif /* WOLFSSL_ASYNC_CRYPT */ - - /* Software AES-GCM */ - -#ifdef WOLFSSL_AESNI - #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_AVX2(intel_flags)) { - AES_GCM_encrypt_avx2(in, out, authIn, iv, authTag, - sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); - return 0; - } - else - #endif - #ifdef HAVE_INTEL_AVX1 - if (IS_INTEL_AVX1(intel_flags)) { - AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, - sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); - return 0; - } - else - #endif - if (haveAESNI) { - AES_GCM_encrypt(in, out, authIn, iv, authTag, - sz, authInSz, ivSz, (const byte*)aes->key, aes->rounds); - return 0; - } - else -#endif - { - return AES_GCM_encrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz); - } -} -#endif - #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AESGCM_DECRYPT) +int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, + const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + int ret = 0; + word32 keySize; #ifdef FREESCALE_LTC_AES_GCM -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - int ret = 0; - word32 keySize; status_t status; - - /* argument checks */ - if (aes == NULL || out == NULL || in == NULL || iv == NULL || - authTag == NULL || authTagSz > AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - ret = wc_AesGetKeySize(aes, &keySize); - if (ret != 0) { - return ret; - } - - status = LTC_AES_DecryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, - authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); - - return (status == kStatus_Success) ? 
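/*
 * Both the removed encrypt wrapper above and the consolidated body keep one
 * public entry point and choose the fastest implementation at run time: AVX2
 * assembly when available, then AVX1, then plain AES-NI, and finally the
 * portable C loop.  A minimal sketch of that selection shape with stub
 * backends; the have_* flags and encrypt_* names are hypothetical and only
 * stand in for intel_flags / AES_GCM_encrypt_avx2() and friends.
 */
#include <stdio.h>

static int encrypt_avx2(void)  { puts("AVX2 path");       return 0; }
static int encrypt_avx1(void)  { puts("AVX1 path");       return 0; }
static int encrypt_aesni(void) { puts("AES-NI path");     return 0; }
static int encrypt_c(void)     { puts("portable C path"); return 0; }

static int gcm_encrypt_dispatch(int have_avx2, int have_avx1, int have_aesni)
{
    if (have_avx2)
        return encrypt_avx2();   /* widest vector unit first */
    if (have_avx1)
        return encrypt_avx1();
    if (have_aesni)
        return encrypt_aesni();
    return encrypt_c();          /* always-available fallback */
}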
0 : AES_GCM_AUTH_E; -} #elif defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - int ret = 0; - word32 keySize; #ifdef WOLFSSL_STM32_CUBEMX CRYP_HandleTypeDef hcryp; #else @@ -7602,6 +7040,18 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte *inPadded = NULL; byte *authInPadded = NULL; byte initialCounter[AES_BLOCK_SIZE]; +#else /* software AES-GCM */ + word32 blocks = sz / AES_BLOCK_SIZE; + word32 partial = sz % AES_BLOCK_SIZE; + const byte* c = in; + byte* p = out; + byte counter[AES_BLOCK_SIZE]; + byte initialCounter[AES_BLOCK_SIZE]; + byte *ctr; + byte scratch[AES_BLOCK_SIZE]; + byte Tprime[AES_BLOCK_SIZE]; + byte EKY0[AES_BLOCK_SIZE]; +#endif /* argument checks */ if (aes == NULL || out == NULL || in == NULL || iv == NULL || @@ -7614,6 +7064,15 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return ret; } +#ifdef FREESCALE_LTC_AES_GCM + + status = LTC_AES_DecryptTagGcm(LTC_BASE, in, out, sz, iv, ivSz, + authIn, authInSz, (byte*)aes->key, keySize, authTag, authTagSz); + + ret = (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E; + +#elif defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || defined(WOLFSSL_STM32F7)) + /* additional argument checks - STM32 HW only supports 12 byte IV */ if (ivSz != NONCE_SZ) { return BAD_FUNC_ARG; @@ -7719,35 +7178,62 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, if (authInPadded != NULL && authPadSz != authInSz) XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER); - return ret; -} #else -#ifdef WOLFSSL_AESNI -int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz); -#else -static -#endif -int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - int ret = 0; - word32 blocks = sz / AES_BLOCK_SIZE; - word32 partial = sz % AES_BLOCK_SIZE; - const byte* c = in; - byte* p = out; - byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr; - byte scratch[AES_BLOCK_SIZE]; - byte Tprime[AES_BLOCK_SIZE]; - byte EKY0[AES_BLOCK_SIZE]; - ctr = counter; + /* software AES GCM */ + +#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) + /* if async and byte count above threshold */ + if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && + sz >= WC_ASYNC_THRESH_AES_GCM) { + #if defined(HAVE_CAVIUM) + /* Not yet supported, contact wolfSSL if interested in using */ + #elif defined(HAVE_INTEL_QA) + return IntelQaSymAesGcmDecrypt(&aes->asyncDev, out, in, sz, + (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + #else /* WOLFSSL_ASYNC_CRYPT_TEST */ + if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_DECRYPT)) { + WC_ASYNC_TEST* testDev = &aes->asyncDev.test; + testDev->aes.aes = aes; + testDev->aes.out = out; + testDev->aes.in = in; + testDev->aes.sz = sz; + testDev->aes.iv = iv; + testDev->aes.ivSz = ivSz; + testDev->aes.authTag = (byte*)authTag; + testDev->aes.authTagSz = authTagSz; + testDev->aes.authIn = authIn; + testDev->aes.authInSz = authInSz; + return WC_PENDING_E; + } + #endif + } +#endif /* WOLFSSL_ASYNC_CRYPT */ + +#ifdef WOLFSSL_AESNI + if 
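/*
 * On the decrypt side the expected tag (Tprime above: GHASH over the AAD and
 * ciphertext, XOR-ed with the encrypted pre-counter block EKY0) has to be
 * compared with the received authTag without revealing, through timing, how
 * many leading bytes happened to match.  A minimal constant-time comparison
 * sketch; ct_compare is an illustrative name for the kind of helper the C
 * decrypt path relies on.
 */
#include <stddef.h>

/* Returns 1 when the two buffers are equal, 0 otherwise, taking the same
 * amount of time regardless of where (or whether) they differ. */
static int ct_compare(const unsigned char* a, const unsigned char* b, size_t n)
{
    unsigned char diff = 0;
    size_t i;

    for (i = 0; i < n; i++)
        diff |= (unsigned char)(a[i] ^ b[i]);  /* accumulate every difference */

    return diff == 0;
}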
(haveAESNI) { + #ifdef HAVE_INTEL_AVX1 + if (IS_INTEL_AVX2(intel_flags)) { + if (AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, + ivSz, (byte*)aes->key, aes->rounds) == 0) + return AES_GCM_AUTH_E; + } + else if (IS_INTEL_AVX1(intel_flags)) { + if (AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, + ivSz, (byte*)aes->key, aes->rounds) == 0) + return AES_GCM_AUTH_E; + } + else + #endif + if (AES_GCM_decrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, + (byte*)aes->key, aes->rounds) == 0) + return AES_GCM_AUTH_E; + return 0; + } +#endif + + ctr = counter; XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); if (ivSz == NONCE_SZ) { XMEMCPY(initialCounter, iv, ivSz); @@ -7817,92 +7303,11 @@ int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz, xorbuf(scratch, c, partial); XMEMCPY(p, scratch, partial); } +#endif return ret; } -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ -#ifdef WOLFSSL_AESNI - int res; -#endif - - /* argument checks */ - if (aes == NULL || out == NULL || in == NULL || iv == NULL || - authTag == NULL || authTagSz > AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - -#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES) - /* if async and byte count above threshold */ - if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES && - sz >= WC_ASYNC_THRESH_AES_GCM) { - #if defined(HAVE_CAVIUM) - /* Not yet supported, contact wolfSSL if interested in using */ - #elif defined(HAVE_INTEL_QA) - return IntelQaSymAesGcmDecrypt(&aes->asyncDev, out, in, sz, - (const byte*)aes->asyncKey, aes->keylen, iv, ivSz, - authTag, authTagSz, authIn, authInSz); - #else /* WOLFSSL_ASYNC_CRYPT_TEST */ - if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_DECRYPT)) { - WC_ASYNC_TEST* testDev = &aes->asyncDev.test; - testDev->aes.aes = aes; - testDev->aes.out = out; - testDev->aes.in = in; - testDev->aes.sz = sz; - testDev->aes.iv = iv; - testDev->aes.ivSz = ivSz; - testDev->aes.authTag = (byte*)authTag; - testDev->aes.authTagSz = authTagSz; - testDev->aes.authIn = authIn; - testDev->aes.authInSz = authInSz; - return WC_PENDING_E; - } - #endif - } -#endif /* WOLFSSL_ASYNC_CRYPT */ - - /* software AES GCM */ - -#ifdef WOLFSSL_AESNI - #ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_AVX2(intel_flags)) { - AES_GCM_decrypt_avx2(in, out, authIn, iv, authTag, sz, authInSz, - ivSz, (byte*)aes->key, aes->rounds, &res); - if (res == 0) - return AES_GCM_AUTH_E; - return 0; - } - else - #endif - #ifdef HAVE_INTEL_AVX1 - if (IS_INTEL_AVX1(intel_flags)) { - AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, - ivSz, (byte*)aes->key, aes->rounds, &res); - if (res == 0) - return AES_GCM_AUTH_E; - return 0; - } - else - #endif - if (haveAESNI) { - AES_GCM_decrypt(in, out, authIn, iv, authTag, sz, authInSz, ivSz, - (byte*)aes->key, aes->rounds, &res); - if (res == 0) - return AES_GCM_AUTH_E; - return 0; - } - else -#endif - { - return AES_GCM_decrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz); - } -} -#endif #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */ #endif /* (WOLFSSL_XILINX_CRYPT) */ diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index d33777609..24469490f 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -5923,7 +5923,7 @@ int aesgcm_test(void) return -4309; #endif /* BENCH_AESGCM_LARGE */ -#ifdef ENABLE_NON_12BYTE_IV_TEST +#if !defined(HAVE_FIPS) && 
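/*
 * Both directions build the pre-counter block the same way: a 96-bit IV is
 * copied in directly with a 32-bit counter of 1 appended (the NONCE_SZ branch
 * above), while any other IV length is folded through GHASH instead.  A
 * minimal sketch of the 96-bit case; build_j0_96 is an illustrative name.
 */
#include <string.h>

static void build_j0_96(const unsigned char iv[12], unsigned char j0[16])
{
    memcpy(j0, iv, 12);   /* J0 = IV || 0x00000001 for a 96-bit IV */
    j0[12] = 0x00;
    j0[13] = 0x00;
    j0[14] = 0x00;
    j0[15] = 0x01;        /* J0 itself encrypts the tag; data blocks start
                             from inc32(J0), i.e. counter value 2 */
}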
!defined(STM32_CRYPTO)
     /* Variable IV length test */
     for (ivlen=0; ivlen<(int)sizeof(k1); ivlen++) {
         /* AES-GCM encrypt and decrypt both use AES encrypt internally */
@@ -5963,29 +5963,6 @@ int aesgcm_test(void)
         return -4313;
     }
-#ifdef BENCH_AESGCM_LARGE
-    /* Variable plain text length test */
-    for (plen=1; plen
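/*
 * The test gate above enables the variable IV length loop only for builds
 * where the software GCM path is in play (not FIPS, and not the STM32
 * hardware path, which accepts 12-byte IVs only).  A minimal sketch of such a
 * round-trip check, assuming a key, plaintext and AAD already set up; the
 * buffer sizes, names and error value are illustrative, not the ones used by
 * aesgcm_test().
 */
#include <wolfssl/wolfcrypt/aes.h>

static int gcm_var_iv_roundtrip(Aes* enc, Aes* dec,
                                const byte* key, word32 keySz,
                                const byte* plain, word32 plainSz,
                                const byte* aad, word32 aadSz)
{
    byte iv[32];
    byte cipher[256];
    byte decrypted[256];
    byte tag[AES_BLOCK_SIZE];
    int  ivlen;

    if (plainSz > sizeof(cipher))
        return -1;                          /* sketch only handles small data */

    XMEMSET(iv, 0xA5, sizeof(iv));          /* any deterministic IV bytes */

    for (ivlen = 1; ivlen <= (int)sizeof(iv); ivlen++) {
        if (wc_AesGcmSetKey(enc, key, keySz) != 0 ||
            wc_AesGcmSetKey(dec, key, keySz) != 0)
            return -1;
        if (wc_AesGcmEncrypt(enc, cipher, plain, plainSz, iv, (word32)ivlen,
                             tag, sizeof(tag), aad, aadSz) != 0)
            return -1;
        if (wc_AesGcmDecrypt(dec, decrypted, cipher, plainSz, iv, (word32)ivlen,
                             tag, sizeof(tag), aad, aadSz) != 0)
            return -1;                      /* bad tag or decrypt failure */
        if (XMEMCMP(decrypted, plain, plainSz) != 0)
            return -1;
    }
    return 0;
}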