diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index fbb79c3e7..76cd20f71 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -525,7 +525,41 @@ void bench_aes(int show)
 #endif
 
     if (show) {
-        printf("AES %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+        printf("AES enc %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
+               blockType, total, persec);
+        SHOW_INTEL_CYCLES
+        printf("\n");
+    }
+#ifdef HAVE_CAVIUM
+    wc_AesFreeCavium(&enc);
+    if (wc_AesInitCavium(&enc, CAVIUM_DEV_ID) != 0) {
+        printf("aes init cavium failed\n");
+        return;
+    }
+#endif
+
+    ret = wc_AesSetKey(&enc, key, 16, iv, AES_DECRYPTION);
+    if (ret != 0) {
+        printf("AesSetKey failed, ret = %d\n", ret);
+        return;
+    }
+    start = current_time(1);
+    BEGIN_INTEL_CYCLES
+
+    for(i = 0; i < numBlocks; i++)
+        wc_AesCbcDecrypt(&enc, plain, cipher, sizeof(plain));
+
+    END_INTEL_CYCLES
+    total = current_time(0) - start;
+
+    persec = 1 / total * numBlocks;
+#ifdef BENCH_EMBEDDED
+    /* since using kB, convert to MB/s */
+    persec = persec / 1024;
+#endif
+
+    if (show) {
+        printf("AES dec %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                blockType, total, persec);
         SHOW_INTEL_CYCLES
         printf("\n");
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 6d6eae21e..6d5d9fa96 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -1094,10 +1094,22 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
                      XASM_LINK("AES_CBC_encrypt");
 
 #ifdef HAVE_AES_DECRYPT
-void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
-                     unsigned char* ivec, unsigned long length,
-                     const unsigned char* KS, int nr)
-                     XASM_LINK("AES_CBC_decrypt");
+    #if defined(WOLFSSL_AESNI_BY4)
+    void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
+                             unsigned char* ivec, unsigned long length,
+                             const unsigned char* KS, int nr)
+                             XASM_LINK("AES_CBC_decrypt_by4");
+    #elif defined(WOLFSSL_AESNI_BY6)
+    void AES_CBC_decrypt_by6(const unsigned char* in, unsigned char* out,
+                             unsigned char* ivec, unsigned long length,
+                             const unsigned char* KS, int nr)
+                             XASM_LINK("AES_CBC_decrypt_by6");
+    #else /* WOLFSSL_AESNI_BYx */
+    void AES_CBC_decrypt_by8(const unsigned char* in, unsigned char* out,
+                             unsigned char* ivec, unsigned long length,
+                             const unsigned char* KS, int nr)
+                             XASM_LINK("AES_CBC_decrypt_by8");
+    #endif /* WOLFSSL_AESNI_BYx */
 #endif /* HAVE_AES_DECRYPT */
 
 #endif /* HAVE_AES_CBC */
@@ -2549,8 +2561,16 @@ int wc_AesSetIV(Aes* aes, const byte* iv)
 
         /* if input and output same will overwrite input iv */
         XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
-        AES_CBC_decrypt(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
+    #if defined(WOLFSSL_AESNI_BY4)
+        AES_CBC_decrypt_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
                         aes->rounds);
+    #elif defined(WOLFSSL_AESNI_BY6)
+        AES_CBC_decrypt_by6(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
+                        aes->rounds);
+    #else /* WOLFSSL_AESNI_BYx */
+        AES_CBC_decrypt_by8(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
+                        aes->rounds);
+    #endif /* WOLFSSL_AESNI_BYx */
         /* store iv for next call */
         XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
         return 0;
diff --git a/wolfcrypt/src/aes_asm.asm b/wolfcrypt/src/aes_asm.asm
index 5453d2e45..6fe026d5c 100644
--- a/wolfcrypt/src/aes_asm.asm
+++ b/wolfcrypt/src/aes_asm.asm
@@ -101,220 +101,753 @@ LAST:
 AES_CBC_encrypt ENDP
 
+; void AES_CBC_decrypt_by4(const unsigned char* in,
+;                          unsigned char* out,
+;                          unsigned char ivec[16],
+;                          unsigned long length,
+;                          const unsigned char* KS,
+;                          int nr)
+AES_CBC_decrypt_by4 PROC +; parameter 1: rdi +; parameter 2: rsi +; parameter 3: rdx +; parameter 4: rcx +; parameter 5: r8 +; parameter 6: r9d -; /* -; AES_CBC_decrypt[const ,unsigned char*in -; unsigned ,char*out -; unsigned ,char ivec+16 -; unsigned ,long length -; const ,unsigned char*KS -; int nr] -; */ -; . globl AES_CBC_decrypt -AES_CBC_decrypt PROC -;# parameter 1: rdi -;# parameter 2: rsi -;# parameter 3: rdx -;# parameter 4: rcx -;# parameter 5: r8 -;# parameter 6: r9d - -; save rdi and rsi to rax and r11, restore before ret - mov rax,rdi - mov r11,rsi - -; convert to what we had for att&t convention - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,[rsp+40] - mov r9d,[rsp+48] - -; on microsoft xmm6-xmm15 are non volaitle, let's save on stack and restore at end - sub rsp,8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - movdqa [rsp+0], xmm6 - movdqa [rsp+16], xmm7 - movdqa [rsp+32], xmm8 - movdqa [rsp+48], xmm9 - movdqa [rsp+64], xmm10 - movdqa [rsp+80], xmm11 - movdqa [rsp+96], xmm12 - movdqa [rsp+112], xmm15 - - mov r10,rcx - shr rcx,4 - shl r10,60 - je DNO_PARTS_4 - add rcx,1 + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx,r9 + mov r8, [rsp+40] + mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm15 + ; back to our original code, more or less + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_4 + add rcx, 1 DNO_PARTS_4: - mov r10,rcx - shl r10,62 - shr r10,62 - shr rcx,2 - movdqu xmm5,[rdx] - je DREMAINDER_4 - sub rsi,64 + mov r10, rcx + shl r10, 62 + shr r10, 62 + shr rcx, 2 + movdqu xmm5, [rdx] + je DREMAINDER_4 + sub rsi, 64 DLOOP_4: - movdqu xmm1,[rdi] - movdqu xmm2,16[rdi] - movdqu xmm3,32[rdi] - movdqu xmm4,48[rdi] - movdqa xmm6,xmm1 - movdqa xmm7,xmm2 - movdqa xmm8,xmm3 - movdqa xmm15,xmm4 - movdqa xmm9,[r8] - movdqa xmm10,16[r8] - movdqa xmm11,32[r8] - movdqa xmm12,48[r8] - pxor xmm1,xmm9 - pxor xmm2,xmm9 - pxor xmm3,xmm9 - - pxor xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,64[r8] - movdqa xmm10,80[r8] - movdqa xmm11,96[r8] - movdqa xmm12,112[r8] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - aesdec xmm1,xmm11 - aesdec xmm2,xmm11 - aesdec xmm3,xmm11 - aesdec xmm4,xmm11 - aesdec xmm1,xmm12 - aesdec xmm2,xmm12 - aesdec xmm3,xmm12 - aesdec xmm4,xmm12 - movdqa xmm9,128[r8] - movdqa xmm10,144[r8] - movdqa xmm11,160[r8] - cmp r9d,12 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DLAST_4 - movdqa xmm9,160[r8] - movdqa xmm10,176[r8] - movdqa xmm11,192[r8] - cmp r9d,14 - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 - jb DLAST_4 - - movdqa xmm9,192[r8] - movdqa 
xmm10,208[r8] - movdqa xmm11,224[r8] - aesdec xmm1,xmm9 - aesdec xmm2,xmm9 - aesdec xmm3,xmm9 - aesdec xmm4,xmm9 - aesdec xmm1,xmm10 - aesdec xmm2,xmm10 - aesdec xmm3,xmm10 - aesdec xmm4,xmm10 + movdqu xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqa xmm6, xmm1 + movdqa xmm7, xmm2 + movdqa xmm8, xmm3 + movdqa xmm15, xmm4 + movdqa xmm9, [r8] + movdqa xmm10, 16[r8] + movdqa xmm11, 32[r8] + movdqa xmm12, 48[r8] + pxor xmm1, xmm9 + pxor xmm2, xmm9 + pxor xmm3, xmm9 + pxor xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + movdqa xmm9, 64[r8] + movdqa xmm10, 80[r8] + movdqa xmm11, 96[r8] + movdqa xmm12, 112[r8] + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + movdqa xmm9, 128[r8] + movdqa xmm10, 144[r8] + movdqa xmm11, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + jb DLAST_4 + movdqa xmm9, 160[r8] + movdqa xmm10, 176[r8] + movdqa xmm11, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + jb DLAST_4 + movdqa xmm9, 192[r8] + movdqa xmm10, 208[r8] + movdqa xmm11, 224[r8] + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 DLAST_4: - add rdi,64 - add rsi,64 - dec rcx - aesdeclast xmm1,xmm11 - aesdeclast xmm2,xmm11 - aesdeclast xmm3,xmm11 - aesdeclast xmm4,xmm11 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - pxor xmm4,xmm8 - movdqu [rsi],xmm1 - movdqu 16[rsi],xmm2 - movdqu 32[rsi],xmm3 - movdqu 48[rsi],xmm4 - movdqa xmm5,xmm15 - jne DLOOP_4 - add rsi,64 + add rdi, 64 + add rsi, 64 + dec rcx + aesdeclast xmm1, xmm11 + aesdeclast xmm2, xmm11 + aesdeclast xmm3, xmm11 + aesdeclast xmm4, xmm11 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pxor xmm3, xmm7 + pxor xmm4, xmm8 + movdqu [rsi], xmm1 + movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqa xmm5, xmm15 + jne DLOOP_4 + add rsi, 64 DREMAINDER_4: - cmp r10,0 - je DEND_4 + cmp r10, 0 + je DEND_4 DLOOP_4_2: - movdqu xmm1,[rdi] - movdqa xmm15,xmm1 - add rdi,16 - pxor xmm1,[r8] - movdqu xmm2,160[r8] - cmp r9d,12 - aesdec xmm1,16[r8] - aesdec xmm1,32[r8] - aesdec xmm1,48[r8] - aesdec xmm1,64[r8] - aesdec xmm1,80[r8] - aesdec xmm1,96[r8] - aesdec xmm1,112[r8] - aesdec xmm1,128[r8] - aesdec xmm1,144[r8] - jb DLAST_4_2 - movdqu xmm2,192[r8] - cmp r9d,14 - aesdec xmm1,160[r8] - aesdec xmm1,176[r8] - jb DLAST_4_2 - movdqu xmm2,224[r8] - aesdec xmm1,192[r8] - aesdec xmm1,208[r8] + movdqu xmm1, [rdi] + movdqa xmm15, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_4_2 + movdqu xmm2, 192[r8] + cmp 
r9d, 14 + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_4_2 + movdqu xmm2, 224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] DLAST_4_2: - aesdeclast xmm1,xmm2 - pxor xmm1,xmm5 - movdqa xmm5,xmm15 - movdqu [rsi],xmm1 - - add rsi,16 - dec r10 - jne DLOOP_4_2 + aesdeclast xmm1, xmm2 + pxor xmm1, xmm5 + movdqa xmm5, xmm15 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_4_2 DEND_4: - ; restore non volatile rdi,rsi - mov rdi,rax - mov rsi,r11 - ; restore non volatile xmms from stack - movdqa xmm6, [rsp+0] - movdqa xmm7, [rsp+16] - movdqa xmm8, [rsp+32] - movdqa xmm9, [rsp+48] - movdqa xmm10, [rsp+64] - movdqa xmm11, [rsp+80] - movdqa xmm12, [rsp+96] - movdqa xmm15, [rsp+112] - add rsp,8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each - ret -AES_CBC_decrypt ENDP + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm15, [rsp+112] + add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each + ret +AES_CBC_decrypt_by4 ENDP + + +; void AES_CBC_decrypt_by6(const unsigned char *in, +; unsigned char *out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char *KS, +; int nr) +AES_CBC_decrypt_by6 PROC +; parameter 1: rdi - in +; parameter 2: rsi - out +; parameter 3: rdx - ivec +; parameter 4: rcx - length +; parameter 5: r8 - KS +; parameter 6: r9d - nr + + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+40] + mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm13 + movdqa [rsp+128], xmm14 + ; back to our original code, more or less + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_6 + add rcx, 1 +DNO_PARTS_6: + mov r12, rax + mov r13, rdx + mov r14, rbx + mov rdx, 0 + mov rax, rcx + mov rbx, 6 + div rbx + mov rcx, rax + mov r10, rdx + mov rax, r12 + mov rdx, r13 + mov rbx, r14 + cmp rcx, 0 + movdqu xmm7, [rdx] + je DREMAINDER_6 + sub rsi, 96 +DLOOP_6: + movdqu xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqu xmm5, 64[rdi] + movdqu xmm6, 80[rdi] + movdqa xmm8, [r8] + movdqa xmm9, 16[r8] + movdqa xmm10, 32[r8] + movdqa xmm11, 48[r8] + pxor xmm1, xmm8 + pxor xmm2, xmm8 + pxor xmm3, xmm8 + pxor xmm4, xmm8 + pxor xmm5, xmm8 + pxor xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + movdqa xmm8, 64[r8] + movdqa xmm9, 80[r8] + movdqa xmm10, 96[r8] + movdqa xmm11, 112[r8] + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, 
xmm9 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + movdqa xmm8, 128[r8] + movdqa xmm9, 144[r8] + movdqa xmm10, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + jb DLAST_6 + movdqa xmm8, 160[r8] + movdqa xmm9, 176[r8] + movdqa xmm10, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 + jb DLAST_6 + movdqa xmm8, 192[r8] + movdqa xmm9, 208[r8] + movdqa xmm10, 224[r8] + aesdec xmm1, xmm8 + aesdec xmm2, xmm8 + aesdec xmm3, xmm8 + aesdec xmm4, xmm8 + aesdec xmm5, xmm8 + aesdec xmm6, xmm8 + aesdec xmm1, xmm9 + aesdec xmm2, xmm9 + aesdec xmm3, xmm9 + aesdec xmm4, xmm9 + aesdec xmm5, xmm9 + aesdec xmm6, xmm9 +DLAST_6: + add rsi, 96 + aesdeclast xmm1, xmm10 + aesdeclast xmm2, xmm10 + aesdeclast xmm3, xmm10 + aesdeclast xmm4, xmm10 + aesdeclast xmm5, xmm10 + aesdeclast xmm6, xmm10 + movdqu xmm8, [rdi] + movdqu xmm9, 16[rdi] + movdqu xmm10, 32[rdi] + movdqu xmm11, 48[rdi] + movdqu xmm12, 64[rdi] + movdqu xmm13, 80[rdi] + pxor xmm1, xmm7 + pxor xmm2, xmm8 + pxor xmm3, xmm9 + pxor xmm4, xmm10 + pxor xmm5, xmm11 + pxor xmm6, xmm12 + movdqu xmm7, xmm13 + movdqu [rsi], xmm1 + movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqu 64[rsi], xmm5 + movdqu 80[rsi], xmm6 + add rdi, 96 + dec rcx + jne DLOOP_6 + add rsi, 96 +DREMAINDER_6: + cmp r10, 0 + je DEND_6 +DLOOP_6_2: + movdqu xmm1, [rdi] + movdqa xmm10, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_6_2 + movdqu xmm2, 192[r8] + cmp r9d, 14 + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_6_2 + movdqu xmm2, 224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] +DLAST_6_2: + aesdeclast xmm1, xmm2 + pxor xmm1, xmm7 + movdqa xmm7, xmm10 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_6_2 +DEND_6: + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm13, [rsp+112] + movdqa xmm14, [rsp+128] + add rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each + ret +AES_CBC_decrypt_by6 ENDP + + +; void AES_CBC_decrypt_by8(const unsigned char *in, +; unsigned char *out, +; unsigned char ivec[16], +; unsigned long length, +; const unsigned char *KS, +; int nr) +AES_CBC_decrypt_by8 PROC +; parameter 1: rdi - in +; parameter 2: rsi - out +; parameter 3: rdx - ivec +; parameter 4: rcx - length +; parameter 5: r8 - KS +; parameter 6: r9d - nr + + ; save rdi and rsi to rax and r11, restore before ret + mov rax, rdi + mov r11, rsi + ; convert to what we had for att&t convention + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx,r9 + mov r8, [rsp+40] + 
mov r9d, [rsp+48] + ; on microsoft xmm6-xmm15 are non volatile, + ; let's save on stack and restore at end + sub rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each + movdqa [rsp+0], xmm6 + movdqa [rsp+16], xmm7 + movdqa [rsp+32], xmm8 + movdqa [rsp+48], xmm9 + movdqa [rsp+64], xmm10 + movdqa [rsp+80], xmm11 + movdqa [rsp+96], xmm12 + movdqa [rsp+112], xmm13 + ; back to our original code, more or less + mov r10, rcx + shr rcx, 4 + shl r10, 60 + je DNO_PARTS_8 + add rcx, 1 +DNO_PARTS_8: + mov r10, rcx + shl r10, 61 + shr r10, 61 + shr rcx, 3 + movdqu xmm9, [rdx] + je DREMAINDER_8 + sub rsi, 128 +DLOOP_8: + movdqu xmm1, [rdi] + movdqu xmm2, 16[rdi] + movdqu xmm3, 32[rdi] + movdqu xmm4, 48[rdi] + movdqu xmm5, 64[rdi] + movdqu xmm6, 80[rdi] + movdqu xmm7, 96[rdi] + movdqu xmm8, 112[rdi] + movdqa xmm10, [r8] + movdqa xmm11, 16[r8] + movdqa xmm12, 32[r8] + movdqa xmm13, 48[r8] + pxor xmm1, xmm10 + pxor xmm2, xmm10 + pxor xmm3, xmm10 + pxor xmm4, xmm10 + pxor xmm5, xmm10 + pxor xmm6, xmm10 + pxor xmm7, xmm10 + pxor xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + aesdec xmm5, xmm12 + aesdec xmm6, xmm12 + aesdec xmm7, xmm12 + aesdec xmm8, xmm12 + aesdec xmm1, xmm13 + aesdec xmm2, xmm13 + aesdec xmm3, xmm13 + aesdec xmm4, xmm13 + aesdec xmm5, xmm13 + aesdec xmm6, xmm13 + aesdec xmm7, xmm13 + aesdec xmm8, xmm13 + movdqa xmm10, 64[r8] + movdqa xmm11, 80[r8] + movdqa xmm12, 96[r8] + movdqa xmm13, 112[r8] + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + aesdec xmm1, xmm12 + aesdec xmm2, xmm12 + aesdec xmm3, xmm12 + aesdec xmm4, xmm12 + aesdec xmm5, xmm12 + aesdec xmm6, xmm12 + aesdec xmm7, xmm12 + aesdec xmm8, xmm12 + aesdec xmm1, xmm13 + aesdec xmm2, xmm13 + aesdec xmm3, xmm13 + aesdec xmm4, xmm13 + aesdec xmm5, xmm13 + aesdec xmm6, xmm13 + aesdec xmm7, xmm13 + aesdec xmm8, xmm13 + movdqa xmm10, 128[r8] + movdqa xmm11, 144[r8] + movdqa xmm12, 160[r8] + cmp r9d, 12 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + jb DLAST_8 + movdqa xmm10, 160[r8] + movdqa xmm11, 176[r8] + movdqa xmm12, 192[r8] + cmp r9d, 14 + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec xmm7, xmm11 + aesdec xmm8, xmm11 + jb DLAST_8 + movdqa xmm10, 192[r8] + movdqa xmm11, 208[r8] + movdqa xmm12, 224[r8] + aesdec xmm1, xmm10 + aesdec xmm2, xmm10 + aesdec xmm3, xmm10 + aesdec xmm4, xmm10 + aesdec xmm5, xmm10 + aesdec xmm6, xmm10 + aesdec xmm7, xmm10 + aesdec xmm8, xmm10 + aesdec xmm1, xmm11 + aesdec xmm2, xmm11 + aesdec xmm3, xmm11 + aesdec xmm4, xmm11 + aesdec xmm5, xmm11 + aesdec xmm6, xmm11 + aesdec 
xmm7, xmm11 + aesdec xmm8, xmm11 +DLAST_8: + add rsi, 128 + aesdeclast xmm1, xmm12 + aesdeclast xmm2, xmm12 + aesdeclast xmm3, xmm12 + aesdeclast xmm4, xmm12 + aesdeclast xmm5, xmm12 + aesdeclast xmm6, xmm12 + aesdeclast xmm7, xmm12 + aesdeclast xmm8, xmm12 + movdqu xmm10, [rdi] + movdqu xmm11, 16[rdi] + movdqu xmm12, 32[rdi] + movdqu xmm13, 48[rdi] + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + movdqu xmm10, 64[rdi] + movdqu xmm11, 80[rdi] + movdqu xmm12, 96[rdi] + movdqu xmm9, 112[rdi] + pxor xmm6, xmm10 + pxor xmm7, xmm11 + pxor xmm8, xmm12 + movdqu [rsi], xmm1 + movdqu 16[rsi], xmm2 + movdqu 32[rsi], xmm3 + movdqu 48[rsi], xmm4 + movdqu 64[rsi], xmm5 + movdqu 80[rsi], xmm6 + movdqu 96[rsi], xmm7 + movdqu 112[rsi], xmm8 + add rdi, 128 + dec rcx + jne DLOOP_8 + add rsi, 128 +DREMAINDER_8: + cmp r10, 0 + je DEND_8 +DLOOP_8_2: + movdqu xmm1, [rdi] + movdqa xmm10, xmm1 + add rdi, 16 + pxor xmm1, [r8] + movdqu xmm2, 160[r8] + cmp r9d, 12 + aesdec xmm1, 16[r8] + aesdec xmm1, 32[r8] + aesdec xmm1, 48[r8] + aesdec xmm1, 64[r8] + aesdec xmm1, 80[r8] + aesdec xmm1, 96[r8] + aesdec xmm1, 112[r8] + aesdec xmm1, 128[r8] + aesdec xmm1, 144[r8] + jb DLAST_8_2 + movdqu xmm2, 192[r8] + cmp r9d, 14 + aesdec xmm1, 160[r8] + aesdec xmm1, 176[r8] + jb DLAST_8_2 + movdqu xmm2, 224[r8] + aesdec xmm1, 192[r8] + aesdec xmm1, 208[r8] +DLAST_8_2: + aesdeclast xmm1, xmm2 + pxor xmm1, xmm9 + movdqa xmm9, xmm10 + movdqu [rsi], xmm1 + add rsi, 16 + dec r10 + jne DLOOP_8_2 +DEND_8: + ; restore non volatile rdi,rsi + mov rdi, rax + mov rsi, r11 + ; restore non volatile xmms from stack + movdqa xmm6, [rsp+0] + movdqa xmm7, [rsp+16] + movdqa xmm8, [rsp+32] + movdqa xmm9, [rsp+48] + movdqa xmm10, [rsp+64] + movdqa xmm11, [rsp+80] + movdqa xmm12, [rsp+96] + movdqa xmm13, [rsp+112] + add rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each + ret +AES_CBC_decrypt_by8 ENDP + ; /* ; AES_ECB_encrypt[const ,unsigned char*in diff --git a/wolfcrypt/src/aes_asm.s b/wolfcrypt/src/aes_asm.s index 385a2d49a..ac67a09ee 100644 --- a/wolfcrypt/src/aes_asm.s +++ b/wolfcrypt/src/aes_asm.s @@ -86,18 +86,18 @@ jne LOOP ret - +#if defined(WOLFSSL_AESNI_BY4) /* -AES_CBC_decrypt (const unsigned char *in, +AES_CBC_decrypt_by4 (const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned long length, const unsigned char *KS, int nr) */ -.globl AES_CBC_decrypt -AES_CBC_decrypt: +.globl AES_CBC_decrypt_by4 +AES_CBC_decrypt_by4: # parameter 1: %rdi # parameter 2: %rsi # parameter 3: %rdx @@ -105,165 +105,638 @@ AES_CBC_decrypt: # parameter 5: %r8 # parameter 6: %r9d -movq %rcx, %r10 -shrq $4, %rcx -shlq $60, %r10 -je DNO_PARTS_4 -addq $1, %rcx + movq %rcx, %r10 + shrq $4, %rcx + shlq $60, %r10 + je DNO_PARTS_4 + addq $1, %rcx DNO_PARTS_4: -movq %rcx, %r10 -shlq $62, %r10 -shrq $62, %r10 -shrq $2, %rcx -movdqu (%rdx),%xmm5 -je DREMAINDER_4 -subq $64, %rsi + movq %rcx, %r10 + shlq $62, %r10 + shrq $62, %r10 + shrq $2, %rcx + movdqu (%rdx),%xmm5 + je DREMAINDER_4 + subq $64, %rsi DLOOP_4: -movdqu (%rdi), %xmm1 -movdqu 16(%rdi), %xmm2 -movdqu 32(%rdi), %xmm3 -movdqu 48(%rdi), %xmm4 -movdqa %xmm1, %xmm6 -movdqa %xmm2, %xmm7 -movdqa %xmm3, %xmm8 -movdqa %xmm4, %xmm15 -movdqa (%r8), %xmm9 -movdqa 16(%r8), %xmm10 -movdqa 32(%r8), %xmm11 -movdqa 48(%r8), %xmm12 -pxor %xmm9, %xmm1 -pxor %xmm9, %xmm2 -pxor %xmm9, %xmm3 - -pxor %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -aesdec %xmm11, %xmm1 -aesdec %xmm11, %xmm2 -aesdec %xmm11, %xmm3 
-aesdec %xmm11, %xmm4 -aesdec %xmm12, %xmm1 -aesdec %xmm12, %xmm2 -aesdec %xmm12, %xmm3 -aesdec %xmm12, %xmm4 -movdqa 64(%r8), %xmm9 -movdqa 80(%r8), %xmm10 -movdqa 96(%r8), %xmm11 -movdqa 112(%r8), %xmm12 -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -aesdec %xmm11, %xmm1 -aesdec %xmm11, %xmm2 -aesdec %xmm11, %xmm3 -aesdec %xmm11, %xmm4 -aesdec %xmm12, %xmm1 -aesdec %xmm12, %xmm2 -aesdec %xmm12, %xmm3 -aesdec %xmm12, %xmm4 -movdqa 128(%r8), %xmm9 -movdqa 144(%r8), %xmm10 -movdqa 160(%r8), %xmm11 -cmpl $12, %r9d -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -jb DLAST_4 -movdqa 160(%r8), %xmm9 -movdqa 176(%r8), %xmm10 -movdqa 192(%r8), %xmm11 -cmpl $14, %r9d -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 -jb DLAST_4 - -movdqa 192(%r8), %xmm9 -movdqa 208(%r8), %xmm10 -movdqa 224(%r8), %xmm11 -aesdec %xmm9, %xmm1 -aesdec %xmm9, %xmm2 -aesdec %xmm9, %xmm3 -aesdec %xmm9, %xmm4 -aesdec %xmm10, %xmm1 -aesdec %xmm10, %xmm2 -aesdec %xmm10, %xmm3 -aesdec %xmm10, %xmm4 + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqa %xmm1, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm3, %xmm8 + movdqa %xmm4, %xmm15 + movdqa (%r8), %xmm9 + movdqa 16(%r8), %xmm10 + movdqa 32(%r8), %xmm11 + movdqa 48(%r8), %xmm12 + pxor %xmm9, %xmm1 + pxor %xmm9, %xmm2 + pxor %xmm9, %xmm3 + pxor %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + movdqa 64(%r8), %xmm9 + movdqa 80(%r8), %xmm10 + movdqa 96(%r8), %xmm11 + movdqa 112(%r8), %xmm12 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + movdqa 128(%r8), %xmm9 + movdqa 144(%r8), %xmm10 + movdqa 160(%r8), %xmm11 + cmpl $12, %r9d + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + jb DLAST_4 + movdqa 160(%r8), %xmm9 + movdqa 176(%r8), %xmm10 + movdqa 192(%r8), %xmm11 + cmpl $14, %r9d + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + jb DLAST_4 + movdqa 192(%r8), %xmm9 + movdqa 208(%r8), %xmm10 + movdqa 224(%r8), %xmm11 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 DLAST_4: -addq $64, %rdi -addq $64, %rsi -decq %rcx -aesdeclast %xmm11, %xmm1 -aesdeclast %xmm11, %xmm2 -aesdeclast %xmm11, %xmm3 -aesdeclast %xmm11, %xmm4 -pxor %xmm5 ,%xmm1 -pxor %xmm6 ,%xmm2 -pxor %xmm7 ,%xmm3 -pxor %xmm8 ,%xmm4 -movdqu %xmm1, (%rsi) -movdqu %xmm2, 
16(%rsi) -movdqu %xmm3, 32(%rsi) -movdqu %xmm4, 48(%rsi) -movdqa %xmm15,%xmm5 -jne DLOOP_4 -addq $64, %rsi + addq $64, %rdi + addq $64, %rsi + decq %rcx + aesdeclast %xmm11, %xmm1 + aesdeclast %xmm11, %xmm2 + aesdeclast %xmm11, %xmm3 + aesdeclast %xmm11, %xmm4 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm8, %xmm4 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqa %xmm15,%xmm5 + jne DLOOP_4 + addq $64, %rsi DREMAINDER_4: -cmpq $0, %r10 -je DEND_4 + cmpq $0, %r10 + je DEND_4 DLOOP_4_2: -movdqu (%rdi), %xmm1 -movdqa %xmm1 ,%xmm15 -addq $16, %rdi -pxor (%r8), %xmm1 -movdqu 160(%r8), %xmm2 -cmpl $12, %r9d -aesdec 16(%r8), %xmm1 -aesdec 32(%r8), %xmm1 -aesdec 48(%r8), %xmm1 -aesdec 64(%r8), %xmm1 -aesdec 80(%r8), %xmm1 -aesdec 96(%r8), %xmm1 -aesdec 112(%r8), %xmm1 -aesdec 128(%r8), %xmm1 -aesdec 144(%r8), %xmm1 -jb DLAST_4_2 -movdqu 192(%r8), %xmm2 -cmpl $14, %r9d -aesdec 160(%r8), %xmm1 -aesdec 176(%r8), %xmm1 -jb DLAST_4_2 -movdqu 224(%r8), %xmm2 -aesdec 192(%r8), %xmm1 -aesdec 208(%r8), %xmm1 + movdqu (%rdi), %xmm1 + movdqa %xmm1, %xmm15 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), %xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb DLAST_4_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb DLAST_4_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 DLAST_4_2: -aesdeclast %xmm2, %xmm1 -pxor %xmm5, %xmm1 -movdqa %xmm15, %xmm5 -movdqu %xmm1, (%rsi) - -addq $16, %rsi -decq %r10 -jne DLOOP_4_2 + aesdeclast %xmm2, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm15, %xmm5 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne DLOOP_4_2 DEND_4: -ret + ret + +#elif defined(WOLFSSL_AESNI_BY6) + +/* +AES_CBC_decrypt_by6 (const unsigned char *in, + unsigned char *out, + unsigned char ivec[16], + unsigned long length, + const unsigned char *KS, + int nr) +*/ +.globl AES_CBC_decrypt_by6 +AES_CBC_decrypt_by6: +# parameter 1: %rdi - in +# parameter 2: %rsi - out +# parameter 3: %rdx - ivec +# parameter 4: %rcx - length +# parameter 5: %r8 - KS +# parameter 6: %r9d - nr + + movq %rcx, %r10 + shrq $4, %rcx + shlq $60, %r10 + je DNO_PARTS_6 + addq $1, %rcx +DNO_PARTS_6: + movq %rax, %r12 + movq %rdx, %r13 + movq %rbx, %r14 + movq $0, %rdx + movq %rcx, %rax + movq $6, %rbx + div %rbx + movq %rax, %rcx + movq %rdx, %r10 + movq %r12, %rax + movq %r13, %rdx + movq %r14, %rbx + cmpq $0, %rcx + movdqu (%rdx), %xmm7 + je DREMAINDER_6 + subq $96, %rsi +DLOOP_6: + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqu 64(%rdi), %xmm5 + movdqu 80(%rdi), %xmm6 + movdqa (%r8), %xmm8 + movdqa 16(%r8), %xmm9 + movdqa 32(%r8), %xmm10 + movdqa 48(%r8), %xmm11 + pxor %xmm8, %xmm1 + pxor %xmm8, %xmm2 + pxor %xmm8, %xmm3 + pxor %xmm8, %xmm4 + pxor %xmm8, %xmm5 + pxor %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + movdqa 64(%r8), %xmm8 + movdqa 80(%r8), %xmm9 + 
movdqa 96(%r8), %xmm10 + movdqa 112(%r8), %xmm11 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + movdqa 128(%r8), %xmm8 + movdqa 144(%r8), %xmm9 + movdqa 160(%r8), %xmm10 + cmpl $12, %r9d + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + jb DLAST_6 + movdqa 160(%r8), %xmm8 + movdqa 176(%r8), %xmm9 + movdqa 192(%r8), %xmm10 + cmpl $14, %r9d + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 + jb DLAST_6 + movdqa 192(%r8), %xmm8 + movdqa 208(%r8), %xmm9 + movdqa 224(%r8), %xmm10 + aesdec %xmm8, %xmm1 + aesdec %xmm8, %xmm2 + aesdec %xmm8, %xmm3 + aesdec %xmm8, %xmm4 + aesdec %xmm8, %xmm5 + aesdec %xmm8, %xmm6 + aesdec %xmm9, %xmm1 + aesdec %xmm9, %xmm2 + aesdec %xmm9, %xmm3 + aesdec %xmm9, %xmm4 + aesdec %xmm9, %xmm5 + aesdec %xmm9, %xmm6 +DLAST_6: + addq $96, %rsi + aesdeclast %xmm10, %xmm1 + aesdeclast %xmm10, %xmm2 + aesdeclast %xmm10, %xmm3 + aesdeclast %xmm10, %xmm4 + aesdeclast %xmm10, %xmm5 + aesdeclast %xmm10, %xmm6 + movdqu (%rdi), %xmm8 + movdqu 16(%rdi), %xmm9 + movdqu 32(%rdi), %xmm10 + movdqu 48(%rdi), %xmm11 + movdqu 64(%rdi), %xmm12 + movdqu 80(%rdi), %xmm13 + pxor %xmm7, %xmm1 + pxor %xmm8, %xmm2 + pxor %xmm9, %xmm3 + pxor %xmm10, %xmm4 + pxor %xmm11, %xmm5 + pxor %xmm12, %xmm6 + movdqu %xmm13, %xmm7 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqu %xmm5, 64(%rsi) + movdqu %xmm6, 80(%rsi) + addq $96, %rdi + decq %rcx + jne DLOOP_6 + addq $96, %rsi +DREMAINDER_6: + cmpq $0, %r10 + je DEND_6 +DLOOP_6_2: + movdqu (%rdi), %xmm1 + movdqa %xmm1, %xmm10 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), %xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb DLAST_6_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb DLAST_6_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 +DLAST_6_2: + aesdeclast %xmm2, %xmm1 + pxor %xmm7, %xmm1 + movdqa %xmm10, %xmm7 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne DLOOP_6_2 +DEND_6: + ret + +#else /* WOLFSSL_AESNI_BYx */ + +/* +AES_CBC_decrypt_by8 (const unsigned char *in, + unsigned char *out, + unsigned char ivec[16], + unsigned long length, + const unsigned char *KS, + int nr) +*/ +.globl AES_CBC_decrypt_by8 +AES_CBC_decrypt_by8: +# parameter 1: %rdi - in +# parameter 2: %rsi - out +# parameter 3: %rdx - ivec +# parameter 4: %rcx - length +# parameter 5: %r8 - KS +# parameter 6: %r9d - nr + + movq %rcx, %r10 
+ shrq $4, %rcx + shlq $60, %r10 + je DNO_PARTS_8 + addq $1, %rcx +DNO_PARTS_8: + movq %rcx, %r10 + shlq $61, %r10 + shrq $61, %r10 + shrq $3, %rcx + movdqu (%rdx), %xmm9 + je DREMAINDER_8 + subq $128, %rsi +DLOOP_8: + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm2 + movdqu 32(%rdi), %xmm3 + movdqu 48(%rdi), %xmm4 + movdqu 64(%rdi), %xmm5 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm7 + movdqu 112(%rdi), %xmm8 + movdqa (%r8), %xmm10 + movdqa 16(%r8), %xmm11 + movdqa 32(%r8), %xmm12 + movdqa 48(%r8), %xmm13 + pxor %xmm10, %xmm1 + pxor %xmm10, %xmm2 + pxor %xmm10, %xmm3 + pxor %xmm10, %xmm4 + pxor %xmm10, %xmm5 + pxor %xmm10, %xmm6 + pxor %xmm10, %xmm7 + pxor %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + aesdec %xmm12, %xmm5 + aesdec %xmm12, %xmm6 + aesdec %xmm12, %xmm7 + aesdec %xmm12, %xmm8 + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm3 + aesdec %xmm13, %xmm4 + aesdec %xmm13, %xmm5 + aesdec %xmm13, %xmm6 + aesdec %xmm13, %xmm7 + aesdec %xmm13, %xmm8 + movdqa 64(%r8), %xmm10 + movdqa 80(%r8), %xmm11 + movdqa 96(%r8), %xmm12 + movdqa 112(%r8), %xmm13 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + aesdec %xmm12, %xmm1 + aesdec %xmm12, %xmm2 + aesdec %xmm12, %xmm3 + aesdec %xmm12, %xmm4 + aesdec %xmm12, %xmm5 + aesdec %xmm12, %xmm6 + aesdec %xmm12, %xmm7 + aesdec %xmm12, %xmm8 + aesdec %xmm13, %xmm1 + aesdec %xmm13, %xmm2 + aesdec %xmm13, %xmm3 + aesdec %xmm13, %xmm4 + aesdec %xmm13, %xmm5 + aesdec %xmm13, %xmm6 + aesdec %xmm13, %xmm7 + aesdec %xmm13, %xmm8 + movdqa 128(%r8), %xmm10 + movdqa 144(%r8), %xmm11 + movdqa 160(%r8), %xmm12 + cmpl $12, %r9d + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + jb DLAST_8 + movdqa 160(%r8), %xmm10 + movdqa 176(%r8), %xmm11 + movdqa 192(%r8), %xmm12 + cmpl $14, %r9d + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 + jb DLAST_8 + movdqa 192(%r8), %xmm10 + movdqa 208(%r8), %xmm11 + movdqa 224(%r8), %xmm12 + aesdec %xmm10, %xmm1 + aesdec %xmm10, %xmm2 + aesdec %xmm10, %xmm3 + aesdec %xmm10, %xmm4 + aesdec %xmm10, %xmm5 + aesdec %xmm10, %xmm6 + aesdec %xmm10, %xmm7 + aesdec %xmm10, %xmm8 + aesdec %xmm11, %xmm1 + aesdec %xmm11, %xmm2 + aesdec %xmm11, %xmm3 + aesdec %xmm11, %xmm4 + aesdec %xmm11, %xmm5 + aesdec %xmm11, %xmm6 + aesdec %xmm11, %xmm7 + aesdec %xmm11, %xmm8 +DLAST_8: + addq $128, %rsi + aesdeclast %xmm12, %xmm1 + aesdeclast %xmm12, %xmm2 + 
aesdeclast %xmm12, %xmm3 + aesdeclast %xmm12, %xmm4 + aesdeclast %xmm12, %xmm5 + aesdeclast %xmm12, %xmm6 + aesdeclast %xmm12, %xmm7 + aesdeclast %xmm12, %xmm8 + movdqu (%rdi), %xmm10 + movdqu 16(%rdi), %xmm11 + movdqu 32(%rdi), %xmm12 + movdqu 48(%rdi), %xmm13 + pxor %xmm9, %xmm1 + pxor %xmm10, %xmm2 + pxor %xmm11, %xmm3 + pxor %xmm12, %xmm4 + pxor %xmm13, %xmm5 + movdqu 64(%rdi), %xmm10 + movdqu 80(%rdi), %xmm11 + movdqu 96(%rdi), %xmm12 + movdqu 112(%rdi), %xmm9 + pxor %xmm10, %xmm6 + pxor %xmm11, %xmm7 + pxor %xmm12, %xmm8 + movdqu %xmm1, (%rsi) + movdqu %xmm2, 16(%rsi) + movdqu %xmm3, 32(%rsi) + movdqu %xmm4, 48(%rsi) + movdqu %xmm5, 64(%rsi) + movdqu %xmm6, 80(%rsi) + movdqu %xmm7, 96(%rsi) + movdqu %xmm8, 112(%rsi) + addq $128, %rdi + decq %rcx + jne DLOOP_8 + addq $128, %rsi +DREMAINDER_8: + cmpq $0, %r10 + je DEND_8 +DLOOP_8_2: + movdqu (%rdi), %xmm1 + movdqa %xmm1, %xmm10 + addq $16, %rdi + pxor (%r8), %xmm1 + movdqu 160(%r8), %xmm2 + cmpl $12, %r9d + aesdec 16(%r8), %xmm1 + aesdec 32(%r8), %xmm1 + aesdec 48(%r8), %xmm1 + aesdec 64(%r8), %xmm1 + aesdec 80(%r8), %xmm1 + aesdec 96(%r8), %xmm1 + aesdec 112(%r8), %xmm1 + aesdec 128(%r8), %xmm1 + aesdec 144(%r8), %xmm1 + jb DLAST_8_2 + movdqu 192(%r8), %xmm2 + cmpl $14, %r9d + aesdec 160(%r8), %xmm1 + aesdec 176(%r8), %xmm1 + jb DLAST_8_2 + movdqu 224(%r8), %xmm2 + aesdec 192(%r8), %xmm1 + aesdec 208(%r8), %xmm1 +DLAST_8_2: + aesdeclast %xmm2, %xmm1 + pxor %xmm9, %xmm1 + movdqa %xmm10, %xmm9 + movdqu %xmm1, (%rsi) + addq $16, %rsi + decq %r10 + jne DLOOP_8_2 +DEND_8: + ret + +#endif /* WOLFSSL_AESNI_BYx */ /* diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 6930ed96c..9e831323c 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -2669,6 +2669,94 @@ int aes_test(void) if (memcmp(cipher, verify, AES_BLOCK_SIZE)) return -61; +#if defined(WOLFSSL_AESNI) && defined(HAVE_AES_DECRYPT) + { + const byte bigMsg[] = { + /* "All work and no play makes Jack a dull boy. 
" */ + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20, + 0x61,0x20,0x64,0x75,0x6c,0x6c,0x20,0x62, + 0x6f,0x79,0x2e,0x20,0x41,0x6c,0x6c,0x20, + 0x77,0x6f,0x72,0x6b,0x20,0x61,0x6e,0x64, + 0x20,0x6e,0x6f,0x20,0x70,0x6c,0x61,0x79, + 0x20,0x6d,0x61,0x6b,0x65,0x73,0x20,0x4a, + 0x61,0x63,0x6b,0x20,0x61,0x20,0x64,0x75, + 0x6c,0x6c,0x20,0x62,0x6f,0x79,0x2e,0x20, + 0x41,0x6c,0x6c,0x20,0x77,0x6f,0x72,0x6b, + 0x20,0x61,0x6e,0x64,0x20,0x6e,0x6f,0x20, + 0x70,0x6c,0x61,0x79,0x20,0x6d,0x61,0x6b, + 0x65,0x73,0x20,0x4a,0x61,0x63,0x6b,0x20 + }; + const byte bigKey[] = "0123456789abcdeffedcba9876543210"; + byte bigCipher[sizeof(bigMsg)]; + byte bigPlain[sizeof(bigMsg)]; + word32 keySz, msgSz; + + /* Iterate from one AES_BLOCK_SIZE of bigMsg through the whole + * message by AES_BLOCK_SIZE for each size of AES key. */ + for (keySz = 16; keySz <= 32; keySz += 8) { + for (msgSz = AES_BLOCK_SIZE; + msgSz <= sizeof(bigMsg); + msgSz += AES_BLOCK_SIZE) { + + memset(bigCipher, 0, sizeof(bigCipher)); + memset(bigPlain, 0, sizeof(bigPlain)); + ret = wc_AesSetKey(&enc, bigKey, keySz, iv, AES_ENCRYPTION); + if (ret != 0) + return -1030; + ret = wc_AesSetKey(&dec, bigKey, keySz, iv, AES_DECRYPTION); + if (ret != 0) + return -1031; + + ret = wc_AesCbcEncrypt(&enc, bigCipher, bigMsg, msgSz); + if (ret != 0) + return -1032; + ret = wc_AesCbcDecrypt(&dec, bigPlain, bigCipher, msgSz); + if (ret != 0) + return -1033; + + if (memcmp(bigPlain, bigMsg, msgSz)) + return -1034; + } + } + } +#endif /* WOLFSSL_AESNI HAVE_AES_DECRYPT */ + #ifdef HAVE_CAVIUM wc_AesFreeCavium(&enc); wc_AesFreeCavium(&dec);