From b75dc127f3ba330ad3926e9d9d1352453a3f37b5 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Fri, 8 Apr 2016 11:53:40 -0700 Subject: [PATCH 01/12] 1. Attempting to perform 8 AES-CBC decrypt operations simultaneously. 2. Added code to test large AES-CBC decrypts. --- wolfcrypt/src/aes.c | 12 ++ wolfcrypt/src/aes_asm.s | 250 ++++++++++++++++++++++++++++++++++++++++ wolfcrypt/test/test.c | 83 +++++++++++++ 3 files changed, 345 insertions(+) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 6d6eae21e..a3c98370b 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -1094,10 +1094,17 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, XASM_LINK("AES_CBC_encrypt"); #ifdef HAVE_AES_DECRYPT +#ifndef HAVE_AES_DECRYPT_EX void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned char* ivec, unsigned long length, const unsigned char* KS, int nr) XASM_LINK("AES_CBC_decrypt"); +#else /* HAVE_AES_DECRYPT_EX */ +void AES_CBC_decrypt_ex(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_ex"); +#endif /* HAVE_AES_DECRYPT_EX */ #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AES_CBC */ @@ -2549,8 +2556,13 @@ int wc_AesSetIV(Aes* aes, const byte* iv) /* if input and output same will overwrite input iv */ XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); +#ifndef HAVE_AES_DECRYPT_EX AES_CBC_decrypt(in, out, (byte*)aes->reg, sz, (byte*)aes->key, aes->rounds); +#else /* HAVE_AES_DECRYPT_EX */ + AES_CBC_decrypt_ex(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + aes->rounds); +#endif /* HAVE_AES_DECRYPT_EX */ /* store iv for next call */ XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE); return 0; diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index 385a2d49a..c9ae40a18 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -266,6 +266,256 @@ DEND_4: ret +/* +AES_CBC_decrypt_ex (const unsigned char *in, + unsigned char *out, + unsigned char ivec[16], + unsigned long length, + const unsigned char *KS, + int nr) +*/ +.globl AES_CBC_decrypt_ex +AES_CBC_decrypt_ex: +# parameter 1: %rdi - in +# parameter 2: %rsi - out +# parameter 3: %rdx - ivec +# parameter 4: %rcx - length +# parameter 5: %r8 - KS +# parameter 6: %r9d - nr + + movq %rcx, %r10 + shrq $4, %rcx + shlq $60, %r10 + je ENO_PARTS_8 + addq $1, %rcx +ENO_PARTS_8: + movq %rcx, %r10 + shlq $61, %r10 + shrq $61, %r10 + shrq $3, %rcx + movdqu (%rdx), %xmm9 + je EREMAINDER_8 + subq $128, %rsi +ELOOP_8: + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqu 64(%rdi), %xmm5 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm7 + movdqu 112(%rdi), %xmm8 + movdqa (%r8), %xmm10 + movdqa 16(%r8), %xmm11 + movdqa 32(%r8), %xmm12 + movdqa 48(%r8), %xmm13 + pxor %xmm10, %xmm1 + pxor %xmm10, %xmm2 + pxor %xmm10, %xmm3 + pxor %xmm10, %xmm4 + pxor %xmm10, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm10, %xmm7 + pxor %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + aesdec %xmm12, %xmm5 + aesdec %xmm12, %xmm6 + aesdec %xmm12, %xmm7 + aesdec %xmm12, %xmm8 + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm3 + aesdec %xmm13, %xmm4 + aesdec %xmm13, %xmm5 + aesdec %xmm13, %xmm6 + aesdec %xmm13, 
%xmm7 + aesdec %xmm13, %xmm8 + movdqa 64(%r8), %xmm10 + movdqa 80(%r8), %xmm11 + movdqa 96(%r8), %xmm12 + movdqa 112(%r8), %xmm13 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + aesdec %xmm12, %xmm5 + aesdec %xmm12, %xmm6 + aesdec %xmm12, %xmm7 + aesdec %xmm12, %xmm8 + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm3 + aesdec %xmm13, %xmm4 + aesdec %xmm13, %xmm5 + aesdec %xmm13, %xmm6 + aesdec %xmm13, %xmm7 + aesdec %xmm13, %xmm8 + movdqa 128(%r8), %xmm10 + movdqa 144(%r8), %xmm11 + movdqa 160(%r8), %xmm12 + cmpl $12, %r9d + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + jb ELAST_8 + movdqa 160(%r8), %xmm10 + movdqa 176(%r8), %xmm11 + movdqa 192(%r8), %xmm12 + cmpl $14, %r9d + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + jb ELAST_8 + movdqa 192(%r8), %xmm10 + movdqa 208(%r8), %xmm11 + movdqa 224(%r8), %xmm12 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 +ELAST_8: + addq $128, %rsi + aesdeclast %xmm12, %xmm1 + aesdeclast %xmm12, %xmm2 + aesdeclast %xmm12, %xmm3 + aesdeclast %xmm12, %xmm4 + aesdeclast %xmm12, %xmm5 + aesdeclast %xmm12, %xmm6 + aesdeclast %xmm12, %xmm7 + aesdeclast %xmm12, %xmm8 + movdqu (%rdi), %xmm10 + movdqu 16(%rdi), %xmm11 + movdqu 32(%rdi), %xmm12 + movdqu 48(%rdi), %xmm13 + pxor %xmm9, %xmm1 + pxor %xmm10, %xmm2 + pxor %xmm11, %xmm3 + pxor %xmm12, %xmm4 + pxor %xmm13, %xmm5 + movdqu 64(%rdi), %xmm10 + movdqu 80(%rdi), %xmm11 + movdqu 96(%rdi), %xmm12 + movdqu 112(%rdi), %xmm9 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + pxor %xmm12, %xmm8 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqu %xmm5, 64(%rsi) + movdqu %xmm6, 80(%rsi) + movdqu %xmm7, 96(%rsi) + movdqu %xmm8, 112(%rsi) + addq $128, %rdi + decq %rcx + jne ELOOP_8 + addq $128, %rsi +EREMAINDER_8: + cmpq $0, %r10 + je EEND_8 +ELOOP_8_2: + movdqu (%rdi), %xmm1 + movdqa %xmm1 ,%xmm10 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), %xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb 
ELAST_8_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb ELAST_8_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 +ELAST_8_2: + aesdeclast %xmm2, %xmm1 + pxor %xmm9, %xmm1 + movdqa %xmm10, %xmm9 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne ELOOP_8_2 +EEND_8: + ret + + /* AES_ECB_encrypt (const unsigned char *in, unsigned char *out, diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 741d4da36..c55dc3921 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -2669,6 +2669,89 @@ int aes_test(void) if (memcmp(cipher, verify, AES_BLOCK_SIZE)) return -61; +#if defined(WOLFSSL_AESNI) && \ + defined(HAVE_AES_DECRYPT) && defined(HAVE_AES_DECRYPT_EX) + { + const byte bigMsg[] = { + /* "All work and no play makes Jack a dull boy. " */ + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20 + }; + byte bigCipher[sizeof(bigMsg)]; + byte bigPlain[sizeof(bigMsg)]; + + ret = wc_AesSetKey(&enc, key, AES_BLOCK_SIZE, iv, AES_ENCRYPTION); + if (ret != 0) + return -1030; + ret = wc_AesSetKey(&dec, key, AES_BLOCK_SIZE, iv, AES_DECRYPTION); + if (ret != 0) + return -1031; + + #define AESNI_DECRYPT_SIZE (AES_BLOCK_SIZE*24) + + if ((sizeof(bigMsg) < AESNI_DECRYPT_SIZE) || + (AESNI_DECRYPT_SIZE == 0) || + (AESNI_DECRYPT_SIZE % AES_BLOCK_SIZE != 0)) + return -1032; + + ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, AESNI_DECRYPT_SIZE); + if (ret != 0) + return -1033; + ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, 
AESNI_DECRYPT_SIZE); + if (ret != 0) + return -1034; + + if (memcmp(bigPlain, bigMsg, AESNI_DECRYPT_SIZE)) + return -1035; + } +#endif /* WOLFSSL_AESNI HAVE_AES_DECRYPT HAVE_AES_DECRYPT_EX */ + #ifdef HAVE_CAVIUM wc_AesFreeCavium(&enc); wc_AesFreeCavium(&dec); From 698b1cc7dc26c0cf3b764859cfb866874449efc5 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Fri, 8 Apr 2016 13:33:41 -0700 Subject: [PATCH 02/12] update benchmark to show AES-CBC decrypt speed --- wolfcrypt/benchmark/benchmark.c | 36 ++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index fbb79c3e7..76cd20f71 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -525,7 +525,41 @@ void bench_aes(int show) #endif if (show) { - printf("AES %d %s took %5.3f seconds, %8.3f MB/s", numBlocks, + printf("AES enc %d %s took %5.3f seconds, %8.3f MB/s", numBlocks, + blockType, total, persec); + SHOW_INTEL_CYCLES + printf("\n"); + } +#ifdef HAVE_CAVIUM + wc_AesFreeCavium(&enc); + if (wc_AesInitCavium(&enc, CAVIUM_DEV_ID) != 0) { + printf("aes init cavium failed\n"); + return; + } +#endif + + ret = wc_AesSetKey(&enc, key, 16, iv, AES_DECRYPTION); + if (ret != 0) { + printf("AesSetKey failed, ret = %d\n", ret); + return; + } + start = current_time(1); + BEGIN_INTEL_CYCLES + + for(i = 0; i < numBlocks; i++) + wc_AesCbcDecrypt(&enc, plain, cipher, sizeof(plain)); + + END_INTEL_CYCLES + total = current_time(0) - start; + + persec = 1 / total * numBlocks; +#ifdef BENCH_EMBEDDED + /* since using kB, convert to MB/s */ + persec = persec / 1024; +#endif + + if (show) { + printf("AES dec %d %s took %5.3f seconds, %8.3f MB/s", numBlocks, blockType, total, persec); SHOW_INTEL_CYCLES printf("\n"); From 8524afc56a26c1bb6ca4e77a0aabfd472c52578e Mon Sep 17 00:00:00 2001 From: John Safranek Date: Tue, 12 Apr 2016 10:10:55 -0700 Subject: [PATCH 03/12] 1. Rename routine AES_CBC_decrypt_ex as AES_CBC_decrypt_by8 2. Added routine AES_CBC_decrypt_by6 that does six at a time. 3. Setting HAVE_AES_DECRYPT_BY6 or _BY8 (or not setting it) selects the 6, 8, or 4 way version of the assembly routine. 4. Modified AES-NI decrypt test to loop checking against the test bolus from 1 AES block to the whole 24 blocks. 
--- wolfcrypt/src/aes.c | 24 ++--- wolfcrypt/src/aes_asm.s | 228 +++++++++++++++++++++++++++++++++++++++- wolfcrypt/test/test.c | 45 ++++---- 3 files changed, 258 insertions(+), 39 deletions(-) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index a3c98370b..3ce0fbcde 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -1094,17 +1094,22 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, XASM_LINK("AES_CBC_encrypt"); #ifdef HAVE_AES_DECRYPT -#ifndef HAVE_AES_DECRYPT_EX +#if defined(HAVE_AES_DECRYPT_BY8) +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_by8"); +#elif defined(HAVE_AES_DECRYPT_BY6) +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_by6"); +#else void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned char* ivec, unsigned long length, const unsigned char* KS, int nr) XASM_LINK("AES_CBC_decrypt"); -#else /* HAVE_AES_DECRYPT_EX */ -void AES_CBC_decrypt_ex(const unsigned char* in, unsigned char* out, - unsigned char* ivec, unsigned long length, - const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt_ex"); -#endif /* HAVE_AES_DECRYPT_EX */ +#endif /* HAVE_AES_DECRYPT_BYX */ #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AES_CBC */ @@ -2556,13 +2561,8 @@ int wc_AesSetIV(Aes* aes, const byte* iv) /* if input and output same will overwrite input iv */ XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); -#ifndef HAVE_AES_DECRYPT_EX AES_CBC_decrypt(in, out, (byte*)aes->reg, sz, (byte*)aes->key, aes->rounds); -#else /* HAVE_AES_DECRYPT_EX */ - AES_CBC_decrypt_ex(in, out, (byte*)aes->reg, sz, (byte*)aes->key, - aes->rounds); -#endif /* HAVE_AES_DECRYPT_EX */ /* store iv for next call */ XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE); return 0; diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index c9ae40a18..3120aef22 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -267,15 +267,237 @@ ret /* -AES_CBC_decrypt_ex (const unsigned char *in, +AES_CBC_decrypt_by6 (const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned long length, const unsigned char *KS, int nr) */ -.globl AES_CBC_decrypt_ex -AES_CBC_decrypt_ex: +.globl AES_CBC_decrypt_by6 +AES_CBC_decrypt_by6: +# parameter 1: %rdi - in +# parameter 2: %rsi - out +# parameter 3: %rdx - ivec +# parameter 4: %rcx - length +# parameter 5: %r8 - KS +# parameter 6: %r9d - nr + + movq %rcx, %r10 + shrq $4, %rcx + shlq $60, %r10 + je ENO_PARTS_6 + addq $1, %rcx +ENO_PARTS_6: + movq %rax, %r12 + movq %rdx, %r13 + movq %rbx, %r14 + movq $0, %rdx + movq %rcx, %rax + movq $6, %rbx + div %rbx + movq %rax, %rcx + movq %rdx, %r10 + movq %r12, %rax + movq %r13, %rdx + movq %r14, %rbx + cmpq $0, %rcx + movdqu (%rdx), %xmm7 + je EREMAINDER_6 + subq $96, %rsi +ELOOP_6: + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqu 64(%rdi), %xmm5 + movdqu 80(%rdi), %xmm6 + movdqa (%r8), %xmm8 + movdqa 16(%r8), %xmm9 + movdqa 32(%r8), %xmm10 + movdqa 48(%r8), %xmm11 + pxor %xmm8, %xmm1 + pxor %xmm8, %xmm2 + pxor %xmm8, %xmm3 + pxor %xmm8, %xmm4 + pxor %xmm8, %xmm5 + pxor %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + aesdec %xmm10, %xmm1 + aesdec 
%xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + movdqa 64(%r8), %xmm8 + movdqa 80(%r8), %xmm9 + movdqa 96(%r8), %xmm10 + movdqa 112(%r8), %xmm11 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + movdqa 128(%r8), %xmm8 + movdqa 144(%r8), %xmm9 + movdqa 160(%r8), %xmm10 + cmpl $12, %r9d + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + jb ELAST_6 + movdqa 160(%r8), %xmm8 + movdqa 176(%r8), %xmm9 + movdqa 192(%r8), %xmm10 + cmpl $14, %r9d + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + jb ELAST_6 + movdqa 192(%r8), %xmm8 + movdqa 208(%r8), %xmm9 + movdqa 224(%r8), %xmm10 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 +ELAST_6: + addq $96, %rsi + aesdeclast %xmm10, %xmm1 + aesdeclast %xmm10, %xmm2 + aesdeclast %xmm10, %xmm3 + aesdeclast %xmm10, %xmm4 + aesdeclast %xmm10, %xmm5 + aesdeclast %xmm10, %xmm6 + movdqu (%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movdqu 64(%rdi), %xmm12 + movdqu 80(%rdi), %xmm13 + pxor %xmm7, %xmm1 + pxor %xmm8, %xmm2 + pxor %xmm9, %xmm3 + pxor %xmm10, %xmm4 + pxor %xmm11, %xmm5 + pxor %xmm12, %xmm6 + movdqu %xmm13, %xmm7 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqu %xmm5, 64(%rsi) + movdqu %xmm6, 80(%rsi) + addq $96, %rdi + decq %rcx + jne ELOOP_6 + addq $96, %rsi +EREMAINDER_6: + cmpq $0, %r10 + je EEND_6 +ELOOP_6_2: + movdqu (%rdi), %xmm1 + movdqa %xmm1 ,%xmm10 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), %xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb ELAST_6_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb ELAST_6_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 +ELAST_6_2: + aesdeclast %xmm2, %xmm1 + pxor %xmm7, %xmm1 + movdqa %xmm10, %xmm7 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne ELOOP_6_2 +EEND_6: + ret + + +/* +AES_CBC_decrypt_by8 (const unsigned char *in, + unsigned char *out, + unsigned char ivec[16], + unsigned long length, + const 
unsigned char *KS, + int nr) +*/ +.globl AES_CBC_decrypt_by8 +AES_CBC_decrypt_by8: # parameter 1: %rdi - in # parameter 2: %rsi - out # parameter 3: %rdx - ivec diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index c55dc3921..6d97aa0e1 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -2669,8 +2669,7 @@ int aes_test(void) if (memcmp(cipher, verify, AES_BLOCK_SIZE)) return -61; -#if defined(WOLFSSL_AESNI) && \ - defined(HAVE_AES_DECRYPT) && defined(HAVE_AES_DECRYPT_EX) +#if defined(WOLFSSL_AESNI) && defined(HAVE_AES_DECRYPT) { const byte bigMsg[] = { /* "All work and no play makes Jack a dull boy. " */ @@ -2725,32 +2724,30 @@ int aes_test(void) }; byte bigCipher[sizeof(bigMsg)]; byte bigPlain[sizeof(bigMsg)]; + word32 i; - ret = wc_AesSetKey(&enc, key, AES_BLOCK_SIZE, iv, AES_ENCRYPTION); - if (ret != 0) - return -1030; - ret = wc_AesSetKey(&dec, key, AES_BLOCK_SIZE, iv, AES_DECRYPTION); - if (ret != 0) - return -1031; + for (i = AES_BLOCK_SIZE; i <= sizeof(bigMsg); i += AES_BLOCK_SIZE) { + memset(bigCipher, 0, sizeof(bigCipher)); + memset(bigPlain, 0, sizeof(bigPlain)); + ret = wc_AesSetKey(&enc, key, AES_BLOCK_SIZE, iv, AES_ENCRYPTION); + if (ret != 0) + return -1030; + ret = wc_AesSetKey(&dec, key, AES_BLOCK_SIZE, iv, AES_DECRYPTION); + if (ret != 0) + return -1031; - #define AESNI_DECRYPT_SIZE (AES_BLOCK_SIZE*24) + ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, i); + if (ret != 0) + return -1032; + ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, i); + if (ret != 0) + return -1033; - if ((sizeof(bigMsg) < AESNI_DECRYPT_SIZE) || - (AESNI_DECRYPT_SIZE == 0) || - (AESNI_DECRYPT_SIZE % AES_BLOCK_SIZE != 0)) - return -1032; - - ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, AESNI_DECRYPT_SIZE); - if (ret != 0) - return -1033; - ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, AESNI_DECRYPT_SIZE); - if (ret != 0) - return -1034; - - if (memcmp(bigPlain, bigMsg, AESNI_DECRYPT_SIZE)) - return -1035; + if (memcmp(bigPlain, bigMsg, i)) + return -1034; + } } -#endif /* WOLFSSL_AESNI HAVE_AES_DECRYPT HAVE_AES_DECRYPT_EX */ +#endif /* WOLFSSL_AESNI HAVE_AES_DECRYPT */ #ifdef HAVE_CAVIUM wc_AesFreeCavium(&enc); From 451fd878f9bab9c509fafe6f5fb5c6b635eabc78 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 11:48:25 -0700 Subject: [PATCH 04/12] touching whitespace in assembly routines being touched right now --- wolfcrypt/src/aes_asm.asm | 400 +++++++++++++++++++------------------- wolfcrypt/src/aes_asm.s | 305 ++++++++++++++--------------- 2 files changed, 347 insertions(+), 358 deletions(-) diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 5453d2e45..345229037 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -101,221 +101,213 @@ LAST: AES_CBC_encrypt ENDP - -; /* -; AES_CBC_decrypt[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,char ivec+16 -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . 
globl AES_CBC_decrypt +; void AES_CBC_decrypt(const unsigned char* in, +; unsigned char* out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char* KS, +; int nr) AES_CBC_decrypt PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8 -;# parameter 6: r9d +; parameter 1: rdi +; parameter 2: rsi +; parameter 3: rdx +; parameter 4: rcx +; parameter 5: r8 +; parameter 6: r9d -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,[rsp+40] - mov r9d,[rsp+48] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm15 - - mov r10,rcx - shr rcx,4 - shl r10,60 - je DNO_PARTS_4 - add rcx,1 + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx,r9 + mov r8, [rsp+40] + mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm15 + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_4 + add rcx, 1 DNO_PARTS_4: - mov r10,rcx - shl r10,62 - shr r10,62 - shr rcx,2 - movdqu xmm5,[rdx] - je DREMAINDER_4 - sub rsi,64 + mov r10, rcx + shl r10, 62 + shr r10, 62 + shr rcx, 2 + movdqu xmm5, [rdx] + je DREMAINDER_4 + sub rsi, 64 DLOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm6,xmm1 - movdqa xmm7,xmm2 - movdqa xmm8,xmm3 - movdqa xmm15,xmm4 - movdqa xmm9,[r8] - movdqa xmm10,16[r8] - movdqa xmm11,32[r8] - movdqa xmm12,48[r8] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - - pxor xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,64[r8] - movdqa xmm10,80[r8] - movdqa xmm11,96[r8] - movdqa xmm12,112[r8] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,128[r8] - movdqa xmm10,144[r8] - movdqa xmm11,160[r8] - cmp r9d,12 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DLAST_4 - movdqa xmm9,160[r8] - movdqa xmm10,176[r8] - movdqa xmm11,192[r8] - cmp r9d,14 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DLAST_4 - - movdqa xmm9,192[r8] - movdqa xmm10,208[r8] - movdqa xmm11,224[r8] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec 
xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 + movdqu xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + movdqa xmm8, xmm3 + movdqa xmm15, xmm4 + movdqa xmm9, [r8] + movdqa xmm10, 16[r8] + movdqa xmm11, 32[r8] + movdqa xmm12, 48[r8] + pxor xmm1, xmm9 + pxor xmm2, xmm9 + pxor xmm3, xmm9 + pxor xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + movdqa xmm9, 64[r8] + movdqa xmm10, 80[r8] + movdqa xmm11, 96[r8] + movdqa xmm12, 112[r8] + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + movdqa xmm9, 128[r8] + movdqa xmm10, 144[r8] + movdqa xmm11, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + jb DLAST_4 + movdqa xmm9, 160[r8] + movdqa xmm10, 176[r8] + movdqa xmm11, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + jb DLAST_4 + movdqa xmm9, 192[r8] + movdqa xmm10, 208[r8] + movdqa xmm11, 224[r8] + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 DLAST_4: - add rdi,64 - add rsi,64 - dec rcx - aesdeclast xmm1,xmm11 - aesdeclast xmm2,xmm11 - aesdeclast xmm3,xmm11 - aesdeclast xmm4,xmm11 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - pxor xmm4,xmm8 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - movdqa xmm5,xmm15 - jne DLOOP_4 - add rsi,64 + add rdi, 64 + add rsi, 64 + dec rcx + aesdeclast xmm1, xmm11 + aesdeclast xmm2, xmm11 + aesdeclast xmm3, xmm11 + aesdeclast xmm4, xmm11 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + pxor xmm4, xmm8 + movdqu [rsi], xmm1 + movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqa xmm5, xmm15 + jne DLOOP_4 + add rsi, 64 DREMAINDER_4: - cmp r10,0 - je DEND_4 + cmp r10, 0 + je DEND_4 DLOOP_4_2: - movdqu xmm1,[rdi] - movdqa xmm15,xmm1 - add rdi,16 - pxor xmm1,[r8] - movdqu xmm2,160[r8] - cmp r9d,12 - aesdec xmm1,16[r8] - aesdec xmm1,32[r8] - aesdec xmm1,48[r8] - aesdec xmm1,64[r8] - aesdec xmm1,80[r8] - aesdec xmm1,96[r8] - aesdec xmm1,112[r8] - aesdec xmm1,128[r8] - aesdec xmm1,144[r8] - jb DLAST_4_2 - movdqu xmm2,192[r8] - cmp r9d,14 - aesdec xmm1,160[r8] - aesdec xmm1,176[r8] - jb DLAST_4_2 - movdqu xmm2,224[r8] - aesdec xmm1,192[r8] - aesdec xmm1,208[r8] + movdqu xmm1, [rdi] + movdqa xmm15, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_4_2 + movdqu xmm2, 192[r8] + cmp r9d, 14 + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_4_2 + movdqu xmm2, 
224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] DLAST_4_2: - aesdeclast xmm1,xmm2 - pxor xmm1,xmm5 - movdqa xmm5,xmm15 - movdqu [rsi],xmm1 - - add rsi,16 - dec r10 - jne DLOOP_4_2 + aesdeclast xmm1, xmm2 + pxor xmm1, xmm5 + movdqa xmm5, xmm15 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_4_2 DEND_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm15, [rsp+112] - add rsp,8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - ret + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm15, [rsp+112] + add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each + ret AES_CBC_decrypt ENDP + ; /* ; AES_ECB_encrypt[const ,unsigned char*in ; unsigned ,char*out diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index 3120aef22..208e52dc5 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -105,165 +105,162 @@ AES_CBC_decrypt: # parameter 5: %r8 # parameter 6: %r9d -movq %rcx, %r10 -shrq $4, %rcx -shlq $60, %r10 -je DNO_PARTS_4 -addq $1, %rcx + movq %rcx, %r10 + shrq $4, %rcx + shlq $60, %r10 + je DNO_PARTS_4 + addq $1, %rcx DNO_PARTS_4: -movq %rcx, %r10 -shlq $62, %r10 -shrq $62, %r10 -shrq $2, %rcx -movdqu (%rdx),%xmm5 -je DREMAINDER_4 -subq $64, %rsi + movq %rcx, %r10 + shlq $62, %r10 + shrq $62, %r10 + shrq $2, %rcx + movdqu (%rdx),%xmm5 + je DREMAINDER_4 + subq $64, %rsi DLOOP_4: -movdqu (%rdi), %xmm1 -movdqu 16(%rdi), %xmm2 -movdqu 32(%rdi), %xmm3 -movdqu 48(%rdi), %xmm4 -movdqa %xmm1, %xmm6 -movdqa %xmm2, %xmm7 -movdqa %xmm3, %xmm8 -movdqa %xmm4, %xmm15 -movdqa (%r8), %xmm9 -movdqa 16(%r8), %xmm10 -movdqa 32(%r8), %xmm11 -movdqa 48(%r8), %xmm12 -pxor %xmm9, %xmm1 -pxor %xmm9, %xmm2 -pxor %xmm9, %xmm3 - -pxor %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -aesdec %xmm11, %xmm1 -aesdec %xmm11, %xmm2 -aesdec %xmm11, %xmm3 -aesdec %xmm11, %xmm4 -aesdec %xmm12, %xmm1 -aesdec %xmm12, %xmm2 -aesdec %xmm12, %xmm3 -aesdec %xmm12, %xmm4 -movdqa 64(%r8), %xmm9 -movdqa 80(%r8), %xmm10 -movdqa 96(%r8), %xmm11 -movdqa 112(%r8), %xmm12 -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -aesdec %xmm11, %xmm1 -aesdec %xmm11, %xmm2 -aesdec %xmm11, %xmm3 -aesdec %xmm11, %xmm4 -aesdec %xmm12, %xmm1 -aesdec %xmm12, %xmm2 -aesdec %xmm12, %xmm3 -aesdec %xmm12, %xmm4 -movdqa 128(%r8), %xmm9 -movdqa 144(%r8), %xmm10 -movdqa 160(%r8), %xmm11 -cmpl $12, %r9d -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -jb DLAST_4 -movdqa 160(%r8), %xmm9 -movdqa 176(%r8), %xmm10 -movdqa 192(%r8), %xmm11 -cmpl $14, %r9d -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -jb DLAST_4 - -movdqa 192(%r8), %xmm9 -movdqa 208(%r8), %xmm10 -movdqa 224(%r8), %xmm11 -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, 
%xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqa %xmm1, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm3, %xmm8 + movdqa %xmm4, %xmm15 + movdqa (%r8), %xmm9 + movdqa 16(%r8), %xmm10 + movdqa 32(%r8), %xmm11 + movdqa 48(%r8), %xmm12 + pxor %xmm9, %xmm1 + pxor %xmm9, %xmm2 + pxor %xmm9, %xmm3 + pxor %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + movdqa 64(%r8), %xmm9 + movdqa 80(%r8), %xmm10 + movdqa 96(%r8), %xmm11 + movdqa 112(%r8), %xmm12 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + movdqa 128(%r8), %xmm9 + movdqa 144(%r8), %xmm10 + movdqa 160(%r8), %xmm11 + cmpl $12, %r9d + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + jb DLAST_4 + movdqa 160(%r8), %xmm9 + movdqa 176(%r8), %xmm10 + movdqa 192(%r8), %xmm11 + cmpl $14, %r9d + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + jb DLAST_4 + movdqa 192(%r8), %xmm9 + movdqa 208(%r8), %xmm10 + movdqa 224(%r8), %xmm11 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 DLAST_4: -addq $64, %rdi -addq $64, %rsi -decq %rcx -aesdeclast %xmm11, %xmm1 -aesdeclast %xmm11, %xmm2 -aesdeclast %xmm11, %xmm3 -aesdeclast %xmm11, %xmm4 -pxor %xmm5 ,%xmm1 -pxor %xmm6 ,%xmm2 -pxor %xmm7 ,%xmm3 -pxor %xmm8 ,%xmm4 -movdqu %xmm1, (%rsi) -movdqu %xmm2, 16(%rsi) -movdqu %xmm3, 32(%rsi) -movdqu %xmm4, 48(%rsi) -movdqa %xmm15,%xmm5 -jne DLOOP_4 -addq $64, %rsi + addq $64, %rdi + addq $64, %rsi + decq %rcx + aesdeclast %xmm11, %xmm1 + aesdeclast %xmm11, %xmm2 + aesdeclast %xmm11, %xmm3 + aesdeclast %xmm11, %xmm4 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm8, %xmm4 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqa %xmm15,%xmm5 + jne DLOOP_4 + addq $64, %rsi DREMAINDER_4: -cmpq $0, %r10 -je DEND_4 + cmpq $0, %r10 + je DEND_4 DLOOP_4_2: -movdqu (%rdi), %xmm1 -movdqa %xmm1 ,%xmm15 -addq $16, %rdi -pxor (%r8), %xmm1 -movdqu 160(%r8), %xmm2 -cmpl $12, %r9d -aesdec 16(%r8), %xmm1 -aesdec 32(%r8), %xmm1 -aesdec 48(%r8), %xmm1 -aesdec 64(%r8), %xmm1 -aesdec 80(%r8), %xmm1 -aesdec 96(%r8), %xmm1 -aesdec 112(%r8), %xmm1 -aesdec 128(%r8), %xmm1 -aesdec 144(%r8), %xmm1 -jb DLAST_4_2 -movdqu 192(%r8), %xmm2 -cmpl $14, %r9d -aesdec 160(%r8), %xmm1 -aesdec 176(%r8), %xmm1 -jb DLAST_4_2 -movdqu 224(%r8), %xmm2 -aesdec 192(%r8), %xmm1 -aesdec 208(%r8), %xmm1 + movdqu (%rdi), %xmm1 + movdqa %xmm1, %xmm15 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), 
%xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb DLAST_4_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb DLAST_4_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 DLAST_4_2: -aesdeclast %xmm2, %xmm1 -pxor %xmm5, %xmm1 -movdqa %xmm15, %xmm5 -movdqu %xmm1, (%rsi) - -addq $16, %rsi -decq %r10 -jne DLOOP_4_2 + aesdeclast %xmm2, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm15, %xmm5 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne DLOOP_4_2 DEND_4: -ret + ret /* @@ -453,7 +450,7 @@ EREMAINDER_6: je EEND_6 ELOOP_6_2: movdqu (%rdi), %xmm1 - movdqa %xmm1 ,%xmm10 + movdqa %xmm1, %xmm10 addq $16, %rdi pxor (%r8), %xmm1 movdqu 160(%r8), %xmm2 @@ -703,7 +700,7 @@ EREMAINDER_8: je EEND_8 ELOOP_8_2: movdqu (%rdi), %xmm1 - movdqa %xmm1 ,%xmm10 + movdqa %xmm1, %xmm10 addq $16, %rdi pxor (%r8), %xmm1 movdqu 160(%r8), %xmm2 From 13f002f186f440feda6033d0358b961981067200 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 12:00:53 -0700 Subject: [PATCH 05/12] only compile in the AES-CBC decrypt-by-size variant selected by define, default by 8 blocks at a time --- wolfcrypt/src/aes.c | 10 +++++----- wolfcrypt/src/aes_asm.s | 12 ++++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 3ce0fbcde..220eec17d 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -1094,12 +1094,12 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, XASM_LINK("AES_CBC_encrypt"); #ifdef HAVE_AES_DECRYPT -#if defined(HAVE_AES_DECRYPT_BY8) +#if defined(WOLFSSL_AESNI_BY4) void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned char* ivec, unsigned long length, const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt_by8"); -#elif defined(HAVE_AES_DECRYPT_BY6) + XASM_LINK("AES_CBC_decrypt_by4"); +#elif defined(WOLFSSL_AESNI_BY6) void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned char* ivec, unsigned long length, const unsigned char* KS, int nr) @@ -1108,8 +1108,8 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned char* ivec, unsigned long length, const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt"); -#endif /* HAVE_AES_DECRYPT_BYX */ + XASM_LINK("AES_CBC_decrypt_by8"); +#endif /* WOLFSSL_AESNI_BYx */ #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AES_CBC */ diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index 208e52dc5..85863d3d7 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -86,18 +86,18 @@ jne LOOP ret - +#if defined(WOLFSSL_AESNI_BY4) /* -AES_CBC_decrypt (const unsigned char *in, +AES_CBC_decrypt_by4 (const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned long length, const unsigned char *KS, int nr) */ -.globl AES_CBC_decrypt -AES_CBC_decrypt: +.globl AES_CBC_decrypt_by4 +AES_CBC_decrypt_by4: # parameter 1: %rdi # parameter 2: %rsi # parameter 3: %rdx @@ -262,6 +262,7 @@ DLAST_4_2: DEND_4: ret +#elif defined(WOLFSSL_AESNI_BY6) /* AES_CBC_decrypt_by6 (const unsigned char *in, @@ -484,6 +485,7 @@ ELAST_6_2: EEND_6: ret +#else /* WOLFSSL_AESNI_BYx */ /* AES_CBC_decrypt_by8 (const unsigned char *in, @@ -734,6 +736,8 @@ ELAST_8_2: EEND_8: ret +#endif /* WOLFSSL_AESNI_BYx */ + /* 
AES_ECB_encrypt (const unsigned char *in, From 57fce855317e46f16c38da6033a5691a5b5b7474 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 12:18:59 -0700 Subject: [PATCH 06/12] modify AES-CBC with AESNI test to check all key sizes for each message size checked --- wolfcrypt/test/test.c | 44 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 6d97aa0e1..0ce5fe997 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -2722,29 +2722,37 @@ int aes_test(void) 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20 }; + const byte bigKey[] = "0123456789abcdeffedcba9876543210"; byte bigCipher[sizeof(bigMsg)]; byte bigPlain[sizeof(bigMsg)]; - word32 i; + word32 keySz, msgSz; - for (i = AES_BLOCK_SIZE; i <= sizeof(bigMsg); i += AES_BLOCK_SIZE) { - memset(bigCipher, 0, sizeof(bigCipher)); - memset(bigPlain, 0, sizeof(bigPlain)); - ret = wc_AesSetKey(&enc, key, AES_BLOCK_SIZE, iv, AES_ENCRYPTION); - if (ret != 0) - return -1030; - ret = wc_AesSetKey(&dec, key, AES_BLOCK_SIZE, iv, AES_DECRYPTION); - if (ret != 0) - return -1031; + /* Iterate from one AES_BLOCK_SIZE of bigMsg through the whole + * message by AES_BLOCK_SIZE for each size of AES key. */ + for (keySz = 16; keySz <= 32; keySz += 8) { + for (msgSz = AES_BLOCK_SIZE; + msgSz <= sizeof(bigMsg); + msgSz += AES_BLOCK_SIZE) { - ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, i); - if (ret != 0) - return -1032; - ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, i); - if (ret != 0) - return -1033; + memset(bigCipher, 0, sizeof(bigCipher)); + memset(bigPlain, 0, sizeof(bigPlain)); + ret = wc_AesSetKey(&enc, bigKey, keySz, iv, AES_ENCRYPTION); + if (ret != 0) + return -1030; + ret = wc_AesSetKey(&dec, bigKey, keySz, iv, AES_DECRYPTION); + if (ret != 0) + return -1031; - if (memcmp(bigPlain, bigMsg, i)) - return -1034; + ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, msgSz); + if (ret != 0) + return -1032; + ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, msgSz); + if (ret != 0) + return -1033; + + if (memcmp(bigPlain, bigMsg, msgSz)) + return -1034; + } } } #endif /* WOLFSSL_AESNI HAVE_AES_DECRYPT */ From 9781fa3dc902b90923241afd51f5cfa5a7f8d602 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 15:51:19 -0700 Subject: [PATCH 07/12] relabel jump points in new code with D (decrypt) mnemonics rather than E (encrypt) --- wolfcrypt/src/aes_asm.s | 64 ++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index 85863d3d7..ac67a09ee 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -284,9 +284,9 @@ AES_CBC_decrypt_by6: movq %rcx, %r10 shrq $4, %rcx shlq $60, %r10 - je ENO_PARTS_6 + je DNO_PARTS_6 addq $1, %rcx -ENO_PARTS_6: +DNO_PARTS_6: movq %rax, %r12 movq %rdx, %r13 movq %rbx, %r14 @@ -301,9 +301,9 @@ ENO_PARTS_6: movq %r14, %rbx cmpq $0, %rcx movdqu (%rdx), %xmm7 - je EREMAINDER_6 + je DREMAINDER_6 subq $96, %rsi -ELOOP_6: +DLOOP_6: movdqu (%rdi), %xmm1 movdqu 16(%rdi), %xmm2 movdqu 32(%rdi), %xmm3 @@ -382,7 +382,7 @@ ELOOP_6: aesdec %xmm9, %xmm4 aesdec %xmm9, %xmm5 aesdec %xmm9, %xmm6 - jb ELAST_6 + jb DLAST_6 movdqa 160(%r8), %xmm8 movdqa 176(%r8), %xmm9 movdqa 192(%r8), %xmm10 @@ -399,7 +399,7 @@ ELOOP_6: aesdec %xmm9, %xmm4 aesdec %xmm9, %xmm5 aesdec %xmm9, %xmm6 - jb ELAST_6 + jb DLAST_6 movdqa 192(%r8), %xmm8 movdqa 208(%r8), %xmm9 movdqa 224(%r8), 
%xmm10 @@ -415,7 +415,7 @@ ELOOP_6: aesdec %xmm9, %xmm4 aesdec %xmm9, %xmm5 aesdec %xmm9, %xmm6 -ELAST_6: +DLAST_6: addq $96, %rsi aesdeclast %xmm10, %xmm1 aesdeclast %xmm10, %xmm2 @@ -444,12 +444,12 @@ ELAST_6: movdqu %xmm6, 80(%rsi) addq $96, %rdi decq %rcx - jne ELOOP_6 + jne DLOOP_6 addq $96, %rsi -EREMAINDER_6: +DREMAINDER_6: cmpq $0, %r10 - je EEND_6 -ELOOP_6_2: + je DEND_6 +DLOOP_6_2: movdqu (%rdi), %xmm1 movdqa %xmm1, %xmm10 addq $16, %rdi @@ -465,24 +465,24 @@ ELOOP_6_2: aesdec 112(%r8), %xmm1 aesdec 128(%r8), %xmm1 aesdec 144(%r8), %xmm1 - jb ELAST_6_2 + jb DLAST_6_2 movdqu 192(%r8), %xmm2 cmpl $14, %r9d aesdec 160(%r8), %xmm1 aesdec 176(%r8), %xmm1 - jb ELAST_6_2 + jb DLAST_6_2 movdqu 224(%r8), %xmm2 aesdec 192(%r8), %xmm1 aesdec 208(%r8), %xmm1 -ELAST_6_2: +DLAST_6_2: aesdeclast %xmm2, %xmm1 pxor %xmm7, %xmm1 movdqa %xmm10, %xmm7 movdqu %xmm1, (%rsi) addq $16, %rsi decq %r10 - jne ELOOP_6_2 -EEND_6: + jne DLOOP_6_2 +DEND_6: ret #else /* WOLFSSL_AESNI_BYx */ @@ -507,17 +507,17 @@ AES_CBC_decrypt_by8: movq %rcx, %r10 shrq $4, %rcx shlq $60, %r10 - je ENO_PARTS_8 + je DNO_PARTS_8 addq $1, %rcx -ENO_PARTS_8: +DNO_PARTS_8: movq %rcx, %r10 shlq $61, %r10 shrq $61, %r10 shrq $3, %rcx movdqu (%rdx), %xmm9 - je EREMAINDER_8 + je DREMAINDER_8 subq $128, %rsi -ELOOP_8: +DLOOP_8: movdqu (%rdi), %xmm1 movdqu 16(%rdi), %xmm2 movdqu 32(%rdi), %xmm3 @@ -618,7 +618,7 @@ ELOOP_8: aesdec %xmm11, %xmm6 aesdec %xmm11, %xmm7 aesdec %xmm11, %xmm8 - jb ELAST_8 + jb DLAST_8 movdqa 160(%r8), %xmm10 movdqa 176(%r8), %xmm11 movdqa 192(%r8), %xmm12 @@ -639,7 +639,7 @@ ELOOP_8: aesdec %xmm11, %xmm6 aesdec %xmm11, %xmm7 aesdec %xmm11, %xmm8 - jb ELAST_8 + jb DLAST_8 movdqa 192(%r8), %xmm10 movdqa 208(%r8), %xmm11 movdqa 224(%r8), %xmm12 @@ -659,7 +659,7 @@ ELOOP_8: aesdec %xmm11, %xmm6 aesdec %xmm11, %xmm7 aesdec %xmm11, %xmm8 -ELAST_8: +DLAST_8: addq $128, %rsi aesdeclast %xmm12, %xmm1 aesdeclast %xmm12, %xmm2 @@ -695,12 +695,12 @@ ELAST_8: movdqu %xmm8, 112(%rsi) addq $128, %rdi decq %rcx - jne ELOOP_8 + jne DLOOP_8 addq $128, %rsi -EREMAINDER_8: +DREMAINDER_8: cmpq $0, %r10 - je EEND_8 -ELOOP_8_2: + je DEND_8 +DLOOP_8_2: movdqu (%rdi), %xmm1 movdqa %xmm1, %xmm10 addq $16, %rdi @@ -716,24 +716,24 @@ ELOOP_8_2: aesdec 112(%r8), %xmm1 aesdec 128(%r8), %xmm1 aesdec 144(%r8), %xmm1 - jb ELAST_8_2 + jb DLAST_8_2 movdqu 192(%r8), %xmm2 cmpl $14, %r9d aesdec 160(%r8), %xmm1 aesdec 176(%r8), %xmm1 - jb ELAST_8_2 + jb DLAST_8_2 movdqu 224(%r8), %xmm2 aesdec 192(%r8), %xmm1 aesdec 208(%r8), %xmm1 -ELAST_8_2: +DLAST_8_2: aesdeclast %xmm2, %xmm1 pxor %xmm9, %xmm1 movdqa %xmm10, %xmm9 movdqu %xmm1, (%rsi) addq $16, %rsi decq %r10 - jne ELOOP_8_2 -EEND_8: + jne DLOOP_8_2 +DEND_8: ret #endif /* WOLFSSL_AESNI_BYx */ From c34944e389a1dc4b19b3cf48e3f16825ba562fc6 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 16:02:18 -0700 Subject: [PATCH 08/12] added intel-format translations of new att-format AES-NI decrypt routines --- wolfcrypt/src/aes_asm.asm | 557 +++++++++++++++++++++++++++++++++++++- 1 file changed, 549 insertions(+), 8 deletions(-) diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 345229037..9bd29b88e 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -101,13 +101,13 @@ LAST: AES_CBC_encrypt ENDP -; void AES_CBC_decrypt(const unsigned char* in, -; unsigned char* out, -; unsigned char ivec[16], -; unsigned long length, -; const unsigned char* KS, -; int nr) -AES_CBC_decrypt PROC +; void AES_CBC_decrypt_by4(const unsigned char* in, +; unsigned char* 
out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char* KS, +; int nr) +AES_CBC_decrypt_by4 PROC ; parameter 1: rdi ; parameter 2: rsi ; parameter 3: rdx @@ -136,6 +136,7 @@ AES_CBC_decrypt PROC movdqa [rsp+80], xmm11 movdqa [rsp+96], xmm12 movdqa [rsp+112], xmm15 + ; back to our original code, more or less mov r10, rcx shr rcx, 4 shl r10, 60 @@ -305,7 +306,547 @@ DEND_4: movdqa xmm15, [rsp+112] add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each ret -AES_CBC_decrypt ENDP +AES_CBC_decrypt_by4 ENDP + + +; void AES_CBC_decrypt_by6(const unsigned char *in, +; unsigned char *out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char *KS, +; int nr) +AES_CBC_decrypt_by6 PROC +; parameter 1: rdi - in +; parameter 2: rsi - out +; parameter 3: rdx - ivec +; parameter 4: rcx - length +; parameter 5: r8 - KS +; parameter 6: r9d - nr + + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+40] + mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm13 + movdqa [rsp+128], xmm14 + ; back to our original code, more or less + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_6 + add rcx, 1 +DNO_PARTS_6: + movq r12, rax + movq r13, rdx + movq r14, rbx + movq rdx, 0 + movq rax, %rcx + movq rbx, 6 + div rbx + movq rcx, rax + movq r10, rdx + movq rax, r12 + movq rdx, r13 + movq rbx, r14 + cmpq rcx, 0 + movdqu xmm7, [rdx] + je DREMAINDER_6 + subq rsi, 96 +DLOOP_6: + movdqu xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqu xmm5, 64[rdi] + movdqu xmm6, 80[rdi] + movdqa xmm8, [r8] + movdqa xmm9, 16[r8] + movdqa xmm10, 32[r8] + movdqa xmm11, 48[r8] + pxor xmm1, xmm8 + pxor xmm2, xmm8 + pxor xmm3, xmm8 + pxor xmm4, xmm8 + pxor xmm5, xmm8 + pxor xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + movdqa xmm8, 64[r8] + movdqa xmm9, 80[r8] + movdqa xmm10, 96[r8] + movdqa xmm11, 112[r8] + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + movdqa xmm8, 128[r8] + movdqa xmm9, 144[r8] + movdqa xmm10, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + jb DLAST_6 + movdqa xmm8, 160[r8] + movdqa xmm9, 
176[r8] + movdqa xmm10, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + jb DLAST_6 + movdqa xmm8, 192[r8] + movdqa xmm9, 208[r8] + movdqa xmm10, 224[r8] + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 +DLAST_6: + add rsi, 96 + aesdeclast xmm1, xmm10 + aesdeclast xmm2, xmm10 + aesdeclast xmm3, xmm10 + aesdeclast xmm4, xmm10 + aesdeclast xmm5, xmm10 + aesdeclast xmm6, xmm10 + movdqu [rdi], xmm8 + movdqu 16[rdi], xmm9 + movdqu 32[rdi], xmm10 + movdqu 48[rdi], xmm11 + movdqu 64[rdi], xmm12 + movdqu 80[rdi], xmm13 + pxor xmm1, xmm7 + pxor xmm2, xmm8 + pxor xmm3, xmm9 + pxor xmm4, xmm10 + pxor xmm5, xmm11 + pxor xmm6, xmm12 + movdqu xmm7, xmm13 + movdqu [rsi], xmm1 + movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqu 64[rsi], xmm5 + movdqu 80[rsi], xmm6 + add rdi, 96 + dec rcx + jne DLOOP_6 + add rsi, 96 +DREMAINDER_6: + cmp r10, 0 + je DEND_6 +DLOOP_6_2: + movdqu xmm1, [rdi] + movdqa xmm10, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_6_2 + movdqu xmm2, 192[r8] + cmp r9d, 14, r9d + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_6_2 + movdqu xmm2, 224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] +DLAST_6_2: + aesdeclast xmm1, xmm2 + pxor xmm1, xmm7 + movdqa xmm7, xmm10 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_6_2 +DEND_6: + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm13, [rsp+112] + movdqa xmm14, [rsp+128] + add rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each + ret +AES_CBC_decrypt_by6 ENDP + + +; void AES_CBC_decrypt_by8(const unsigned char *in, +; unsigned char *out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char *KS, +; int nr) +AES_CBC_decrypt_by8 PROC +; parameter 1: rdi - in +; parameter 2: rsi - out +; parameter 3: rdx - ivec +; parameter 4: rcx - length +; parameter 5: r8 - KS +; parameter 6: r9d - nr + + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx,r9 + mov r8, [rsp+40] + mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm13 + ; back to our original code, more or less + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_8 + add rcx, 1 +DNO_PARTS_8: + mov r10, rcx + shl r10, 61 + shr r10, 61 + shr rcx, 3 + movdqu xmm9, [rdx] + je DREMAINDER_8 + sub rsi, 128 +DLOOP_8: + movdqu 
xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqu xmm5, 64[rdi] + movdqu xmm6, 80[rdi] + movdqu xmm7, 96[rdi] + movdqu xmm8, 112[rdi] + movdqa xmm10, [r8] + movdqa xmm11, 16[r8] + movdqa xmm12, 32[r8] + movdqa xmm13, 48[r8] + pxor xmm1, xmm10 + pxor xmm2, xmm10 + pxor xmm3, xmm10 + pxor xmm4, xmm10 + pxor xmm5, xmm10 + pxor xmm6, xmm10 + pxor xmm7, xmm10 + pxor xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + aesdec xmm5, xmm12 + aesdec xmm6, xmm12 + aesdec xmm7, xmm12 + aesdec xmm8, xmm12 + aesdec xmm1, xmm13 + aesdec xmm2, xmm13 + aesdec xmm3, xmm13 + aesdec xmm4, xmm13 + aesdec xmm5, xmm13 + aesdec xmm6, xmm13 + aesdec xmm7, xmm13 + aesdec xmm8, xmm13 + movdqa xmm10, 64[r8] + movdqa xmm11, 80[r8] + movdqa xmm12, 96[r8] + movdqa xmm13, 112[r8] + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + aesdec xmm5, xmm12 + aesdec xmm6, xmm12 + aesdec xmm7, xmm12 + aesdec xmm8, xmm12 + aesdec xmm1, xmm13 + aesdec xmm2, xmm13 + aesdec xmm3, xmm13 + aesdec xmm4, xmm13 + aesdec xmm5, xmm13 + aesdec xmm6, xmm13 + aesdec xmm7, xmm13 + aesdec xmm8, xmm13 + movdqa xmm10, 128[r8] + movdqa xmm11, 144[r8] + movdqa xmm12, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + jb DLAST_8 + movdqa xmm10, 160[r8] + movdqa xmm11, 176[r8] + movdqa xmm12, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + jb DLAST_8 + movdqa xmm10, 192[r8] + movdqa xmm11, 208[r8] + movdqa xmm12, 224[r8] + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 +DLAST_8: + add 128, rsi + aesdeclast xmm1, xmm12 + aesdeclast xmm2, xmm12 + aesdeclast xmm3, xmm12 + aesdeclast xmm4, xmm12 + aesdeclast xmm5, xmm12 + aesdeclast xmm6, xmm12 + aesdeclast xmm7, xmm12 + aesdeclast xmm8, xmm12 + movdqu xmm10, [rdi] + movdqu xmm11, 16[rdi] + movdqu xmm12, 32[rdi] + movdqu xmm13, 48[rdi] + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + movdqu xmm10, 64[rdi] + movdqu xmm11, 80[rdi] + movdqu xmm12, 96[rdi] + movdqu xmm9, 112[rdi] + pxor xmm6, xmm10 + pxor xmm7, xmm11 + pxor xmm8, xmm12 + movdqu [rsi], xmm1 + 
movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqu 64[rsi], xmm5 + movdqu 80[rsi], xmm6 + movdqu 96[rsi], xmm7 + movdqu 112[rsi], xmm8 + add rdi, 128 + dec rcx + jne DLOOP_8 + add rsi, 128 +DREMAINDER_8: + cmp r10, 0 + je DEND_8 +DLOOP_8_2: + movdqu xmm1, [rdi] + movdqa xmm10, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_8_2 + movdqu xmm2, 192[r8] + cmp r9d, 14 + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_8_2 + movdqu xmm2, 224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] +DLAST_8_2: + aesdeclast xmm1, xmm2 + pxor xmm1, xmm9 + movdqa xmm9, xmm10 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_8_2 +DEND_8: + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm13, [rsp+112] + add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each + ret +AES_CBC_decrypt_by6 ENDP ; /* From 6f51c2a8f8c023130b10567db61e4d207a20351d Mon Sep 17 00:00:00 2001 From: John Safranek Date: Wed, 13 Apr 2016 16:42:58 -0700 Subject: [PATCH 09/12] 1. Fix bad opcode mnemonics in the intel-format source listing. 2. Update the aes.c file to call both format assembly routines the same way. --- wolfcrypt/src/aes.c | 42 +++++++++++++++++++++++---------------- wolfcrypt/src/aes_asm.asm | 32 ++++++++++++++--------------- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 220eec17d..6d5d9fa96 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -1094,22 +1094,22 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, XASM_LINK("AES_CBC_encrypt"); #ifdef HAVE_AES_DECRYPT -#if defined(WOLFSSL_AESNI_BY4) -void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, - unsigned char* ivec, unsigned long length, - const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt_by4"); -#elif defined(WOLFSSL_AESNI_BY6) -void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, - unsigned char* ivec, unsigned long length, - const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt_by6"); -#else -void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, - unsigned char* ivec, unsigned long length, - const unsigned char* KS, int nr) - XASM_LINK("AES_CBC_decrypt_by8"); -#endif /* WOLFSSL_AESNI_BYx */ + #if defined(WOLFSSL_AESNI_BY4) + void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_by4"); + #elif defined(WOLFSSL_AESNI_BY6) + void AES_CBC_decrypt_by6(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_by6"); + #else /* WOLFSSL_AESNI_BYx */ + void AES_CBC_decrypt_by8(const unsigned char* in, unsigned char* out, + unsigned char* ivec, unsigned long length, + const unsigned char* KS, int nr) + XASM_LINK("AES_CBC_decrypt_by8"); + #endif /* WOLFSSL_AESNI_BYx */ #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AES_CBC */ @@ -2561,8 +2561,16 @@ int wc_AesSetIV(Aes* aes, const byte* iv) /* if input and 
output same will overwrite input iv */ XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - AES_CBC_decrypt(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + #if defined(WOLFSSL_AESNI_BY4) + AES_CBC_decrypt_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key, aes->rounds); + #elif defined(WOLFSSL_AESNI_BY6) + AES_CBC_decrypt_by6(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + aes->rounds); + #else /* WOLFSSL_AESNI_BYx */ + AES_CBC_decrypt_by8(in, out, (byte*)aes->reg, sz, (byte*)aes->key, + aes->rounds); + #endif /* WOLFSSL_AESNI_BYx */ /* store iv for next call */ XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE); return 0; diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 9bd29b88e..cd3187794 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -352,22 +352,22 @@ AES_CBC_decrypt_by6 PROC je DNO_PARTS_6 add rcx, 1 DNO_PARTS_6: - movq r12, rax - movq r13, rdx - movq r14, rbx - movq rdx, 0 - movq rax, %rcx - movq rbx, 6 + mov r12, rax + mov r13, rdx + mov r14, rbx + mov rdx, 0 + mov rax, rcx + mov rbx, 6 div rbx - movq rcx, rax - movq r10, rdx - movq rax, r12 - movq rdx, r13 - movq rbx, r14 - cmpq rcx, 0 + mov rcx, rax + mov r10, rdx + mov rax, r12 + mov rdx, r13 + mov rbx, r14 + cmp rcx, 0 movdqu xmm7, [rdx] je DREMAINDER_6 - subq rsi, 96 + sub rsi, 96 DLOOP_6: movdqu xmm1, [rdi] movdqu xmm2, 16[rdi] @@ -532,7 +532,7 @@ DLOOP_6_2: aesdec xmm1, 144[r8] jb DLAST_6_2 movdqu xmm2, 192[r8] - cmp r9d, 14, r9d + cmp r9d, 14 aesdec xmm1, 160[r8] aesdec xmm1, 176[r8] jb DLAST_6_2 @@ -758,7 +758,7 @@ DLOOP_8: aesdec xmm7, xmm11 aesdec xmm8, xmm11 DLAST_8: - add 128, rsi + add rsi, 128 aesdeclast xmm1, xmm12 aesdeclast xmm2, xmm12 aesdeclast xmm3, xmm12 @@ -846,7 +846,7 @@ DEND_8: movdqa xmm13, [rsp+112] add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each ret -AES_CBC_decrypt_by6 ENDP +AES_CBC_decrypt_by8 ENDP ; /* From 5340ea0d796ee3a934f7508c4489f999acc902af Mon Sep 17 00:00:00 2001 From: John Safranek Date: Thu, 14 Apr 2016 10:47:14 -0700 Subject: [PATCH 10/12] fixed a missing operand swap in the AES-CBC decrypt by 6 --- wolfcrypt/src/aes_asm.asm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index cd3187794..6fe026d5c 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -488,12 +488,12 @@ DLAST_6: aesdeclast xmm4, xmm10 aesdeclast xmm5, xmm10 aesdeclast xmm6, xmm10 - movdqu [rdi], xmm8 - movdqu 16[rdi], xmm9 - movdqu 32[rdi], xmm10 - movdqu 48[rdi], xmm11 - movdqu 64[rdi], xmm12 - movdqu 80[rdi], xmm13 + movdqu xmm8, [rdi] + movdqu xmm9, 16[rdi] + movdqu xmm10, 32[rdi] + movdqu xmm11, 48[rdi] + movdqu xmm12, 64[rdi] + movdqu xmm13, 80[rdi] pxor xmm1, xmm7 pxor xmm2, xmm8 pxor xmm3, xmm9 From 526606e42fb9b08c64af2f411071fb76ac355787 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Thu, 14 Apr 2016 11:30:10 -0700 Subject: [PATCH 11/12] added conditional assembly for the intel-format AES_NI decrypt procedures --- wolfcrypt/src/aes_asm.asm | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index 6fe026d5c..cd7bc78a1 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -100,6 +100,7 @@ LAST: ret AES_CBC_encrypt ENDP +IFDEF WOLFSSL_AESNI_BY4 ; void AES_CBC_decrypt_by4(const unsigned char* in, ; unsigned char* out, @@ -308,6 +309,8 @@ DEND_4: ret AES_CBC_decrypt_by4 ENDP +ELSE +IFDEF WOLFSSL_AESNI_BY6 ; void AES_CBC_decrypt_by6(const unsigned char *in, ; unsigned 
char *out, @@ -565,6 +568,7 @@ DEND_6: ret AES_CBC_decrypt_by6 ENDP +ELSE ; void AES_CBC_decrypt_by8(const unsigned char *in, ; unsigned char *out, @@ -848,6 +852,8 @@ DEND_8: ret AES_CBC_decrypt_by8 ENDP +ENDIF +ENDIF ; /* ; AES_ECB_encrypt[const ,unsigned char*in From 4506839c6d2592018a79847043561c54a81969e7 Mon Sep 17 00:00:00 2001 From: John Safranek Date: Thu, 14 Apr 2016 11:57:42 -0700 Subject: [PATCH 12/12] back out last commit. it was a pain to use. --- wolfcrypt/src/aes_asm.asm | 6 ------ 1 file changed, 6 deletions(-) diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm index cd7bc78a1..6fe026d5c 100644 --- a/wolfcrypt/src/aes_asm.asm +++ b/wolfcrypt/src/aes_asm.asm @@ -100,7 +100,6 @@ LAST: ret AES_CBC_encrypt ENDP -IFDEF WOLFSSL_AESNI_BY4 ; void AES_CBC_decrypt_by4(const unsigned char* in, ; unsigned char* out, @@ -309,8 +308,6 @@ DEND_4: ret AES_CBC_decrypt_by4 ENDP -ELSE -IFDEF WOLFSSL_AESNI_BY6 ; void AES_CBC_decrypt_by6(const unsigned char *in, ; unsigned char *out, @@ -568,7 +565,6 @@ DEND_6: ret AES_CBC_decrypt_by6 ENDP -ELSE ; void AES_CBC_decrypt_by8(const unsigned char *in, ; unsigned char *out, @@ -852,8 +848,6 @@ DEND_8: ret AES_CBC_decrypt_by8 ENDP -ENDIF -ENDIF ; /* ; AES_ECB_encrypt[const ,unsigned char*in
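Note on the loop structure shared by these routines: each decrypt procedure first converts the byte length into a 16-byte block count, peels off the blocks that do not fill a complete unrolled group, and then runs the main loop over groups of 8 (or 6) blocks, handling the leftover blocks one at a time afterwards. The sketch below is a minimal, illustrative C equivalent of the prologue arithmetic in AES_CBC_decrypt_by8 (the shr/shl sequence around DNO_PARTS_8 and DREMAINDER_8); split_by8 is a hypothetical helper name used only for explanation, not part of the wolfSSL API.

    #include <stddef.h>

    /* Illustrative only: mirrors the by-8 prologue.
     * length is the CBC payload size in bytes. */
    static void split_by8(size_t length, size_t *groups, size_t *remainder)
    {
        size_t blocks = length >> 4;   /* shr rcx, 4: 16-byte AES blocks      */
        if (length & 0xF)              /* shl 60 / je / add rcx, 1 path       */
            blocks++;                  /* round a partial block up            */
        *remainder = blocks & 0x7;     /* shl 61 / shr 61: blocks mod 8       */
        *groups    = blocks >> 3;      /* shr rcx, 3: full 8-block iterations */
    }

The by-8 (and by-4) variants can take the remainder with plain shifts because the group size is a power of two; the by-6 variant cannot, which is why its DNO_PARTS_6 prologue saves rax/rdx/rbx and uses div by 6 to obtain the loop count and remainder instead.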