Merge pull request #1064 from SparkiDev/cpuid_fix

Fix cpuid to work with different configs
toddouska committed 2017-07-31 11:59:17 -07:00 (committed via GitHub)
3 changed files with 206 additions and 204 deletions

View File

@@ -1402,7 +1402,7 @@ static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
    /* check alignment, decrypt doesn't need alignment */
    if ((wolfssl_word)inBlock % AESNI_ALIGN) {
    #ifndef NO_WOLFSSL_ALLOC_ALIGN
-       byte* tmp = (byte*)XMALLOC(AES_BLOCK_SIZE, aes->heap,
+       byte* tmp = (byte*)XMALLOC(AES_BLOCK_SIZE + AESNI_ALIGN, aes->heap,
                                    DYNAMIC_TYPE_TMP_BUFFER);
        byte* tmp_align;
        if (tmp == NULL) return;
@@ -2924,15 +2924,15 @@ int wc_AesSetIV(Aes* aes, const byte* iv)
    /* check alignment, decrypt doesn't need alignment */
    if ((wolfssl_word)in % AESNI_ALIGN) {
    #ifndef NO_WOLFSSL_ALLOC_ALIGN
-       byte* tmp = (byte*)XMALLOC(sz + AESNI_ALIGN, aes->heap,
-                                   DYNAMIC_TYPE_TMP_BUFFER);
+       byte* tmp = (byte*)XMALLOC(sz + AES_BLOCK_SIZE + AESNI_ALIGN,
+                                   aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
        byte* tmp_align;
        if (tmp == NULL) return MEMORY_E;
        tmp_align = tmp + (AESNI_ALIGN - ((size_t)tmp % AESNI_ALIGN));
        XMEMCPY(tmp_align, in, sz);
-       AES_CBC_encrypt(tmp_align, tmp_align, (byte*)aes->reg, sz, (byte*)aes->key,
-                       aes->rounds);
+       AES_CBC_encrypt(tmp_align, tmp_align, (byte*)aes->reg, sz,
+                       (byte*)aes->key, aes->rounds);
        /* store iv for next call */
        XMEMCPY(aes->reg, tmp_align + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
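Note on the two hunks above: rounding tmp up to the next AESNI_ALIGN boundary consumes up to AESNI_ALIGN bytes of the allocation, so the temporary buffers are now over-allocated to guarantee the aligned copy still fits. A minimal sketch of the same round-up idiom, with plain malloc/free standing in for XMALLOC/XFREE and ALIGN/aligned_copy as hypothetical names:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define ALIGN 16                      /* stands in for AESNI_ALIGN */

    /* Copy sz bytes into a 16-byte-aligned scratch buffer.  Allocating
     * sz + ALIGN bytes guarantees that the rounded-up pointer plus sz
     * bytes stays inside the allocation. */
    static unsigned char* aligned_copy(const unsigned char* in, size_t sz,
                                       unsigned char** base_out)
    {
        unsigned char* base = (unsigned char*)malloc(sz + ALIGN);
        unsigned char* aligned;

        if (base == NULL)
            return NULL;
        /* same round-up as the diff: tmp + (ALIGN - ((size_t)tmp % ALIGN)) */
        aligned = base + (ALIGN - ((uintptr_t)base % ALIGN));
        memcpy(aligned, in, sz);
        *base_out = base;                 /* caller frees base, not aligned */
        return aligned;
    }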
@@ -5552,206 +5552,206 @@ static int AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
        HT[5] = gfmul_shifted(HT[2], HT[2]);
        HT[6] = gfmul_shifted(HT[2], HT[3]);
        HT[7] = gfmul_shifted(HT[3], HT[3]);
        for (; i < nbytes/16/8; i++) {
            r0 = _mm_setzero_si128();
            r1 = _mm_setzero_si128();
            tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
            tmp2 = _mm_add_epi32(ctr1, ONE);
            tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64);
            tmp3 = _mm_add_epi32(ctr1, TWO);
            tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64);
            tmp4 = _mm_add_epi32(ctr1, THREE);
            tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64);
            tmp5 = _mm_add_epi32(ctr1, FOUR);
            tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64);
            tmp6 = _mm_add_epi32(ctr1, FIVE);
            tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64);
            tmp7 = _mm_add_epi32(ctr1, SIX);
            tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64);
            tmp8 = _mm_add_epi32(ctr1, SEVEN);
            tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64);
            ctr1 = _mm_add_epi32(ctr1, EIGHT);
            tmp1 =_mm_xor_si128(tmp1, KEY[0]);
            tmp2 =_mm_xor_si128(tmp2, KEY[0]);
            tmp3 =_mm_xor_si128(tmp3, KEY[0]);
            tmp4 =_mm_xor_si128(tmp4, KEY[0]);
            tmp5 =_mm_xor_si128(tmp5, KEY[0]);
            tmp6 =_mm_xor_si128(tmp6, KEY[0]);
            tmp7 =_mm_xor_si128(tmp7, KEY[0]);
            tmp8 =_mm_xor_si128(tmp8, KEY[0]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+0]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            XV = _mm_xor_si128(XV, X);
            gfmul_only(XV, HT[7], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[1]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[1]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[1]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[1]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[1]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+1]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[6], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[2]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[2]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[2]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[2]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[2]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[2]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[2]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+2]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[5], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[3]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[3]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[3]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[3]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[3]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[3]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[3]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+3]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[4], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[4]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[4]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[4]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[4]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[4]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[4]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[4]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+4]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[3], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[5]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[5]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[5]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[5]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[5]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[5]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[5]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+5]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[2], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[6]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[6]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[6]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[6]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[6]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[6]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[6]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+6]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[1], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[7]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[7]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[7]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[7]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[7]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[7]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[7]);
            /* 128 x 128 Carryless Multiply */
            XV = _mm_loadu_si128(&((__m128i*)in)[i*8+7]);
            XV = _mm_shuffle_epi8(XV, BSWAP_MASK);
            gfmul_only(XV, HT[0], &r0, &r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[8]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[8]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[8]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[8]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[8]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[8]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[8]);
            /* Reduction */
            X = ghash_red(r0, r1);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[9]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[9]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[9]);
            tmp5 = _mm_aesenc_si128(tmp5, KEY[9]);
            tmp6 = _mm_aesenc_si128(tmp6, KEY[9]);
            tmp7 = _mm_aesenc_si128(tmp7, KEY[9]);
            tmp8 = _mm_aesenc_si128(tmp8, KEY[9]);
            lastKey = KEY[10];
            if (nr > 10) {
                tmp1 = _mm_aesenc_si128(tmp1, KEY[10]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[10]);
                tmp3 = _mm_aesenc_si128(tmp3, KEY[10]);
                tmp4 = _mm_aesenc_si128(tmp4, KEY[10]);
                tmp5 = _mm_aesenc_si128(tmp5, KEY[10]);
                tmp6 = _mm_aesenc_si128(tmp6, KEY[10]);
                tmp7 = _mm_aesenc_si128(tmp7, KEY[10]);
                tmp8 = _mm_aesenc_si128(tmp8, KEY[10]);
                tmp1 = _mm_aesenc_si128(tmp1, KEY[11]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[11]);
                tmp3 = _mm_aesenc_si128(tmp3, KEY[11]);
                tmp4 = _mm_aesenc_si128(tmp4, KEY[11]);
                tmp5 = _mm_aesenc_si128(tmp5, KEY[11]);
                tmp6 = _mm_aesenc_si128(tmp6, KEY[11]);
                tmp7 = _mm_aesenc_si128(tmp7, KEY[11]);
                tmp8 = _mm_aesenc_si128(tmp8, KEY[11]);
                lastKey = KEY[12];
                if (nr > 12) {
                    tmp1 = _mm_aesenc_si128(tmp1, KEY[12]);
                    tmp2 = _mm_aesenc_si128(tmp2, KEY[12]);
                    tmp3 = _mm_aesenc_si128(tmp3, KEY[12]);
                    tmp4 = _mm_aesenc_si128(tmp4, KEY[12]);
                    tmp5 = _mm_aesenc_si128(tmp5, KEY[12]);
                    tmp6 = _mm_aesenc_si128(tmp6, KEY[12]);
                    tmp7 = _mm_aesenc_si128(tmp7, KEY[12]);
                    tmp8 = _mm_aesenc_si128(tmp8, KEY[12]);
                    tmp1 = _mm_aesenc_si128(tmp1, KEY[13]);
                    tmp2 = _mm_aesenc_si128(tmp2, KEY[13]);
                    tmp3 = _mm_aesenc_si128(tmp3, KEY[13]);
                    tmp4 = _mm_aesenc_si128(tmp4, KEY[13]);
                    tmp5 = _mm_aesenc_si128(tmp5, KEY[13]);
                    tmp6 = _mm_aesenc_si128(tmp6, KEY[13]);
                    tmp7 = _mm_aesenc_si128(tmp7, KEY[13]);
                    tmp8 = _mm_aesenc_si128(tmp8, KEY[13]);
                    lastKey = KEY[14];
                }
            }
            tmp1 =_mm_aesenclast_si128(tmp1, lastKey);
            tmp2 =_mm_aesenclast_si128(tmp2, lastKey);
            tmp3 =_mm_aesenclast_si128(tmp3, lastKey);
            tmp4 =_mm_aesenclast_si128(tmp4, lastKey);
            tmp5 =_mm_aesenclast_si128(tmp5, lastKey);
            tmp6 =_mm_aesenclast_si128(tmp6, lastKey);
            tmp7 =_mm_aesenclast_si128(tmp7, lastKey);
            tmp8 =_mm_aesenclast_si128(tmp8, lastKey);
            tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
            tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
            tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
            tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
            tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
            tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
            tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
            tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7]));
            _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
            _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
            _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
            _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
            _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
            _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
            _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
            _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
        }
    }
#endif
    for (k = i*8; k < nbytes/16; k++) {
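The long AES_GCM_decrypt hunk above appears to be mostly a re-indentation: the eight-blocks-at-a-time loop now sits inside the if (nbytes >= 16*8) block that precomputes the HT[] hash-key powers, so the precomputation and the unrolled loop share one guard. Each pass of the loop builds eight counter blocks, runs the AES rounds on all eight in parallel, and folds the corresponding eight ciphertext blocks into the GHASH state in between the AES rounds. A standalone sketch of just the counter-block setup (BSWAP_EPI64 and ONE are assumed to match the definitions in aes.c; make_counter_blocks is a hypothetical name):

    #include <emmintrin.h>                /* SSE2 */
    #include <tmmintrin.h>                /* SSSE3: _mm_shuffle_epi8 */

    /* Build the eight counter blocks one pass of the unrolled loop encrypts.
     * ctr1 keeps the counter with each 64-bit half in CPU byte order;
     * BSWAP_EPI64 swaps the halves back to the big-endian layout AES-GCM
     * expects before the block is encrypted. */
    static void make_counter_blocks(__m128i ctr1, __m128i out[8])
    {
        const __m128i BSWAP_EPI64 =
            _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
        const __m128i ONE = _mm_set_epi32(0, 1, 0, 0);  /* +1 on the counter word */
        int j;

        for (j = 0; j < 8; j++) {
            out[j] = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
            /* the unrolled code adds ONE..SEVEN to the original counter
             * instead of accumulating; the result is the same */
            ctr1 = _mm_add_epi32(ctr1, ONE);
        }
    }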

View File

@@ -28,7 +28,8 @@
#include <wolfssl/wolfcrypt/cpuid.h>

-#ifdef WOLFSSL_X86_64_BUILD
+#if defined(WOLFSSL_X86_64_BUILD) || defined(USE_INTEL_SPEEDUP) || \
+    defined(WOLFSSL_AESNI)
    /* Each platform needs to query info type 1 from cpuid to see if aesni is
     * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
     */
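As the comment says, the AES-NI check is a CPUID information-type-1 query. For reference, an equivalent standalone check using GCC/Clang's __get_cpuid (this is not wolfSSL's implementation; has_aesni is a hypothetical name, and AES-NI is reported in ECX bit 25 of leaf 1):

    #include <cpuid.h>                    /* GCC/Clang: __get_cpuid() */

    /* Returns 1 if CPUID leaf 1 reports AES-NI support (ECX bit 25). */
    static int has_aesni(void)
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;                     /* leaf 1 not available */
        return (ecx & (1u << 25)) != 0;
    }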

View File

@@ -32,7 +32,8 @@
extern "C" { extern "C" {
#endif #endif
#ifdef WOLFSSL_X86_64_BUILD #if defined(WOLFSSL_X86_64_BUILD) || defined(USE_INTEL_SPEEDUP) || \
defined(WOLFSSL_AESNI)
#define CPUID_AVX1 0x0001 #define CPUID_AVX1 0x0001
#define CPUID_AVX2 0x0002 #define CPUID_AVX2 0x0002
#define CPUID_RDRAND 0x0004 #define CPUID_RDRAND 0x0004
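The CPUID_* values are single-bit masks, so callers test a detected-feature word by masking. A small sketch, assuming a word32 of these flags has already been obtained from the library's CPUID query (have_avx2 is a hypothetical helper):

    #include <wolfssl/wolfcrypt/cpuid.h>

    /* Test one feature bit in a previously obtained cpuid flag word. */
    static int have_avx2(word32 cpuid_flags)
    {
        return (cpuid_flags & CPUID_AVX2) != 0;
    }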