From cfac603ee1ea0d8365e52998520aa0c5f0d9a140 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 14 Jul 2023 10:48:32 +1000 Subject: [PATCH] AES-GCM x86_64 MSVC ASM: XMM6-15 are non-volatile Put XMM6-15, when used, on the stack at start of function and restore at end of function. --- wolfcrypt/src/aes_gcm_asm.S | 898 +++++++++++----------- wolfcrypt/src/aes_gcm_asm.asm | 1350 +++++++++++++++++++++------------ 2 files changed, 1308 insertions(+), 940 deletions(-) diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index 82db934af..629c72b8c 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -3472,53 +3472,53 @@ _AES_GCM_init_aesni: movdqa %xmm4, %xmm1 movdqa (%rdi), %xmm5 pxor %xmm5, %xmm1 - movdqa 16(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 32(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 48(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 64(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 80(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 96(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 112(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 128(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 144(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 + movdqa 16(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 32(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 48(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 64(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 80(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 96(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 112(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 128(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 144(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 cmpl $11, %esi - movdqa 160(%rdi), %xmm7 + movdqa 160(%rdi), %xmm6 jl L_AES_GCM_init_aesni_calc_iv_12_last - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 176(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 176(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 cmpl $13, %esi - movdqa 192(%rdi), %xmm7 + movdqa 192(%rdi), %xmm6 jl L_AES_GCM_init_aesni_calc_iv_12_last - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 208(%rdi), %xmm7 - aesenc %xmm7, %xmm5 - aesenc %xmm7, %xmm1 - movdqa 224(%rdi), %xmm7 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 208(%rdi), %xmm6 + aesenc %xmm6, %xmm5 + aesenc %xmm6, %xmm1 + movdqa 224(%rdi), %xmm6 L_AES_GCM_init_aesni_calc_iv_12_last: - aesenclast %xmm7, %xmm5 - aesenclast %xmm7, %xmm1 + aesenclast %xmm6, %xmm5 + aesenclast %xmm6, %xmm1 pshufb L_aes_gcm_bswap_mask(%rip), %xmm5 movdqu %xmm1, %xmm15 jmp L_AES_GCM_init_aesni_iv_done @@ -3536,18 +3536,18 @@ L_AES_GCM_init_aesni_iv_not_12: aesenc 128(%rdi), %xmm5 aesenc 144(%rdi), %xmm5 cmpl $11, %esi - movdqa 160(%rdi), %xmm9 + movdqa 160(%rdi), %xmm8 jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last - aesenc %xmm9, %xmm5 + aesenc %xmm8, %xmm5 aesenc 176(%rdi), %xmm5 cmpl $13, %esi - movdqa 192(%rdi), %xmm9 + movdqa 192(%rdi), %xmm8 jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last - aesenc %xmm9, %xmm5 + aesenc %xmm8, %xmm5 aesenc 208(%rdi), %xmm5 - movdqa 224(%rdi), %xmm9 + movdqa 224(%rdi), %xmm8 L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: - aesenclast %xmm9, %xmm5 + aesenclast %xmm8, %xmm5 pshufb L_aes_gcm_bswap_mask(%rip), %xmm5 # Calc counter # Initialization vector @@ -3558,9 +3558,9 @@ L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_init_aesni_calc_iv_lt16 andl $0xfffffff0, %edx L_AES_GCM_init_aesni_calc_iv_16_loop: - movdqu (%r10,%rcx,1), %xmm8 - pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 - pxor %xmm8, %xmm4 + movdqu (%r10,%rcx,1), %xmm7 + pshufb L_aes_gcm_bswap_mask(%rip), %xmm7 + pxor %xmm7, %xmm4 pshufd $0x4e, %xmm4, %xmm1 pshufd $0x4e, %xmm5, %xmm2 movdqa %xmm5, %xmm3 @@ -3573,28 +3573,28 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: pxor %xmm0, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm2 - movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm6 movdqa %xmm3, %xmm4 pslldq $8, %xmm2 psrldq $8, %xmm1 - pxor %xmm2, %xmm7 + pxor %xmm2, %xmm6 pxor %xmm1, %xmm4 - movdqa %xmm7, %xmm0 + movdqa %xmm6, %xmm0 movdqa %xmm4, %xmm1 psrld $31, %xmm0 psrld $31, %xmm1 - pslld $0x01, %xmm7 + pslld $0x01, %xmm6 pslld $0x01, %xmm4 movdqa %xmm0, %xmm2 pslldq $4, %xmm0 psrldq $12, %xmm2 pslldq $4, %xmm1 por %xmm2, %xmm4 - por %xmm0, %xmm7 + por %xmm0, %xmm6 por %xmm1, %xmm4 - movdqa %xmm7, %xmm0 - movdqa %xmm7, %xmm1 - movdqa %xmm7, %xmm2 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm2 pslld $31, %xmm0 pslld $30, %xmm1 pslld $25, %xmm2 @@ -3603,17 +3603,17 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: movdqa %xmm0, %xmm1 psrldq $4, %xmm1 pslldq $12, %xmm0 - pxor %xmm0, %xmm7 - movdqa %xmm7, %xmm2 - movdqa %xmm7, %xmm3 - movdqa %xmm7, %xmm0 + pxor %xmm0, %xmm6 + movdqa %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 psrld $0x01, %xmm2 psrld $2, %xmm3 psrld $7, %xmm0 pxor %xmm3, %xmm2 pxor %xmm0, %xmm2 pxor %xmm1, %xmm2 - pxor %xmm7, %xmm2 + pxor %xmm6, %xmm2 pxor %xmm2, %xmm4 addl $16, %ecx cmpl %edx, %ecx @@ -3623,9 +3623,9 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: je L_AES_GCM_init_aesni_calc_iv_done L_AES_GCM_init_aesni_calc_iv_lt16: subq $16, %rsp - pxor %xmm8, %xmm8 + pxor %xmm7, %xmm7 xorl %r13d, %r13d - movdqu %xmm8, (%rsp) + movdqu %xmm7, (%rsp) L_AES_GCM_init_aesni_calc_iv_loop: movzbl (%r10,%rcx,1), %r12d movb %r12b, (%rsp,%r13,1) @@ -3633,10 +3633,10 @@ L_AES_GCM_init_aesni_calc_iv_loop: incl %r13d cmpl %edx, %ecx jl L_AES_GCM_init_aesni_calc_iv_loop - movdqu (%rsp), %xmm8 + movdqu (%rsp), %xmm7 addq $16, %rsp - pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 - pxor %xmm8, %xmm4 + pshufb L_aes_gcm_bswap_mask(%rip), %xmm7 + pxor %xmm7, %xmm4 pshufd $0x4e, %xmm4, %xmm1 pshufd $0x4e, %xmm5, %xmm2 movdqa %xmm5, %xmm3 @@ -3649,28 +3649,28 @@ L_AES_GCM_init_aesni_calc_iv_loop: pxor %xmm0, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm2 - movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm6 movdqa %xmm3, %xmm4 pslldq $8, %xmm2 psrldq $8, %xmm1 - pxor %xmm2, %xmm7 + pxor %xmm2, %xmm6 pxor %xmm1, %xmm4 - movdqa %xmm7, %xmm0 + movdqa %xmm6, %xmm0 movdqa %xmm4, %xmm1 psrld $31, %xmm0 psrld $31, %xmm1 - pslld $0x01, %xmm7 + pslld $0x01, %xmm6 pslld $0x01, %xmm4 movdqa %xmm0, %xmm2 pslldq $4, %xmm0 psrldq $12, %xmm2 pslldq $4, %xmm1 por %xmm2, %xmm4 - por %xmm0, %xmm7 + por %xmm0, %xmm6 por %xmm1, %xmm4 - movdqa %xmm7, %xmm0 - movdqa %xmm7, %xmm1 - movdqa %xmm7, %xmm2 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm2 pslld $31, %xmm0 pslld $30, %xmm1 pslld $25, %xmm2 @@ -3679,17 +3679,17 @@ L_AES_GCM_init_aesni_calc_iv_loop: movdqa %xmm0, %xmm1 psrldq $4, %xmm1 pslldq $12, %xmm0 - pxor %xmm0, %xmm7 - movdqa %xmm7, %xmm2 - movdqa %xmm7, %xmm3 - movdqa %xmm7, %xmm0 + pxor %xmm0, %xmm6 + movdqa %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 psrld $0x01, %xmm2 psrld $2, %xmm3 psrld $7, %xmm0 pxor %xmm3, %xmm2 pxor %xmm0, %xmm2 pxor %xmm1, %xmm2 - pxor %xmm7, %xmm2 + pxor %xmm6, %xmm2 pxor %xmm2, %xmm4 L_AES_GCM_init_aesni_calc_iv_done: # T = Encrypt counter @@ -3709,28 +3709,28 @@ L_AES_GCM_init_aesni_calc_iv_done: pxor %xmm0, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm2 - movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm6 movdqa %xmm3, %xmm4 pslldq $8, %xmm2 psrldq $8, %xmm1 - pxor %xmm2, %xmm7 + pxor %xmm2, %xmm6 pxor %xmm1, %xmm4 - movdqa %xmm7, %xmm0 + movdqa %xmm6, %xmm0 movdqa %xmm4, %xmm1 psrld $31, %xmm0 psrld $31, %xmm1 - pslld $0x01, %xmm7 + pslld $0x01, %xmm6 pslld $0x01, %xmm4 movdqa %xmm0, %xmm2 pslldq $4, %xmm0 psrldq $12, %xmm2 pslldq $4, %xmm1 por %xmm2, %xmm4 - por %xmm0, %xmm7 + por %xmm0, %xmm6 por %xmm1, %xmm4 - movdqa %xmm7, %xmm0 - movdqa %xmm7, %xmm1 - movdqa %xmm7, %xmm2 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm2 pslld $31, %xmm0 pslld $30, %xmm1 pslld $25, %xmm2 @@ -3739,45 +3739,45 @@ L_AES_GCM_init_aesni_calc_iv_done: movdqa %xmm0, %xmm1 psrldq $4, %xmm1 pslldq $12, %xmm0 - pxor %xmm0, %xmm7 - movdqa %xmm7, %xmm2 - movdqa %xmm7, %xmm3 - movdqa %xmm7, %xmm0 + pxor %xmm0, %xmm6 + movdqa %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 psrld $0x01, %xmm2 psrld $2, %xmm3 psrld $7, %xmm0 pxor %xmm3, %xmm2 pxor %xmm0, %xmm2 pxor %xmm1, %xmm2 - pxor %xmm7, %xmm2 + pxor %xmm6, %xmm2 pxor %xmm2, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm4 # Encrypt counter - movdqa (%rdi), %xmm8 - pxor %xmm4, %xmm8 - aesenc 16(%rdi), %xmm8 - aesenc 32(%rdi), %xmm8 - aesenc 48(%rdi), %xmm8 - aesenc 64(%rdi), %xmm8 - aesenc 80(%rdi), %xmm8 - aesenc 96(%rdi), %xmm8 - aesenc 112(%rdi), %xmm8 - aesenc 128(%rdi), %xmm8 - aesenc 144(%rdi), %xmm8 + movdqa (%rdi), %xmm7 + pxor %xmm4, %xmm7 + aesenc 16(%rdi), %xmm7 + aesenc 32(%rdi), %xmm7 + aesenc 48(%rdi), %xmm7 + aesenc 64(%rdi), %xmm7 + aesenc 80(%rdi), %xmm7 + aesenc 96(%rdi), %xmm7 + aesenc 112(%rdi), %xmm7 + aesenc 128(%rdi), %xmm7 + aesenc 144(%rdi), %xmm7 cmpl $11, %esi - movdqa 160(%rdi), %xmm9 + movdqa 160(%rdi), %xmm8 jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last - aesenc %xmm9, %xmm8 - aesenc 176(%rdi), %xmm8 + aesenc %xmm8, %xmm7 + aesenc 176(%rdi), %xmm7 cmpl $13, %esi - movdqa 192(%rdi), %xmm9 + movdqa 192(%rdi), %xmm8 jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last - aesenc %xmm9, %xmm8 - aesenc 208(%rdi), %xmm8 - movdqa 224(%rdi), %xmm9 + aesenc %xmm8, %xmm7 + aesenc 208(%rdi), %xmm7 + movdqa 224(%rdi), %xmm8 L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: - aesenclast %xmm9, %xmm8 - movdqu %xmm8, %xmm15 + aesenclast %xmm8, %xmm7 + movdqu %xmm7, %xmm15 L_AES_GCM_init_aesni_iv_done: movdqa %xmm15, (%rax) pshufb L_aes_gcm_bswap_epi64(%rip), %xmm4 @@ -3809,9 +3809,9 @@ _AES_GCM_aad_update_aesni: movdqa (%rax), %xmm6 xorl %ecx, %ecx L_AES_GCM_aad_update_aesni_16_loop: - movdqu (%rdi,%rcx,1), %xmm8 - pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 - pxor %xmm8, %xmm5 + movdqu (%rdi,%rcx,1), %xmm7 + pshufb L_aes_gcm_bswap_mask(%rip), %xmm7 + pxor %xmm7, %xmm5 pshufd $0x4e, %xmm5, %xmm1 pshufd $0x4e, %xmm6, %xmm2 movdqa %xmm6, %xmm3 @@ -3888,38 +3888,38 @@ _AES_GCM_encrypt_block_aesni: #endif /* __APPLE__ */ movq %rdx, %r10 movq %rcx, %r11 - movdqu (%r8), %xmm8 - movdqa %xmm8, %xmm9 - pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 - paddd L_aes_gcm_one(%rip), %xmm9 - pxor (%rdi), %xmm8 - movdqu %xmm9, (%r8) - aesenc 16(%rdi), %xmm8 - aesenc 32(%rdi), %xmm8 - aesenc 48(%rdi), %xmm8 - aesenc 64(%rdi), %xmm8 - aesenc 80(%rdi), %xmm8 - aesenc 96(%rdi), %xmm8 - aesenc 112(%rdi), %xmm8 - aesenc 128(%rdi), %xmm8 - aesenc 144(%rdi), %xmm8 + movdqu (%r8), %xmm0 + movdqa %xmm0, %xmm1 + pshufb L_aes_gcm_bswap_epi64(%rip), %xmm0 + paddd L_aes_gcm_one(%rip), %xmm1 + pxor (%rdi), %xmm0 + movdqu %xmm1, (%r8) + aesenc 16(%rdi), %xmm0 + aesenc 32(%rdi), %xmm0 + aesenc 48(%rdi), %xmm0 + aesenc 64(%rdi), %xmm0 + aesenc 80(%rdi), %xmm0 + aesenc 96(%rdi), %xmm0 + aesenc 112(%rdi), %xmm0 + aesenc 128(%rdi), %xmm0 + aesenc 144(%rdi), %xmm0 cmpl $11, %esi - movdqa 160(%rdi), %xmm9 + movdqa 160(%rdi), %xmm1 jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last - aesenc %xmm9, %xmm8 - aesenc 176(%rdi), %xmm8 + aesenc %xmm1, %xmm0 + aesenc 176(%rdi), %xmm0 cmpl $13, %esi - movdqa 192(%rdi), %xmm9 + movdqa 192(%rdi), %xmm1 jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last - aesenc %xmm9, %xmm8 - aesenc 208(%rdi), %xmm8 - movdqa 224(%rdi), %xmm9 + aesenc %xmm1, %xmm0 + aesenc 208(%rdi), %xmm0 + movdqa 224(%rdi), %xmm1 L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: - aesenclast %xmm9, %xmm8 - movdqu (%r11), %xmm9 - pxor %xmm9, %xmm8 - movdqu %xmm8, (%r10) - pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 + aesenclast %xmm1, %xmm0 + movdqu (%r11), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%r10) + pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 repz retq #ifndef __APPLE__ .size AES_GCM_encrypt_block_aesni,.-AES_GCM_encrypt_block_aesni @@ -3938,9 +3938,9 @@ _AES_GCM_ghash_block_aesni: #endif /* __APPLE__ */ movdqa (%rsi), %xmm4 movdqa (%rdx), %xmm5 - movdqu (%rdi), %xmm8 - pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 - pxor %xmm8, %xmm4 + movdqu (%rdi), %xmm7 + pshufb L_aes_gcm_bswap_mask(%rip), %xmm7 + pxor %xmm7, %xmm4 pshufd $0x4e, %xmm4, %xmm1 pshufd $0x4e, %xmm5, %xmm2 movdqa %xmm5, %xmm3 @@ -5243,16 +5243,16 @@ _AES_GCM_encrypt_final_aesni: movdqa (%rdi), %xmm4 movdqa (%r9), %xmm5 movdqa (%r8), %xmm6 - movdqa %xmm5, %xmm9 movdqa %xmm5, %xmm8 - psrlq $63, %xmm9 - psllq $0x01, %xmm8 - pslldq $8, %xmm9 - por %xmm9, %xmm8 + movdqa %xmm5, %xmm7 + psrlq $63, %xmm8 + psllq $0x01, %xmm7 + pslldq $8, %xmm8 + por %xmm8, %xmm7 pshufd $0xff, %xmm5, %xmm5 psrad $31, %xmm5 pand L_aes_gcm_mod2_128(%rip), %xmm5 - pxor %xmm8, %xmm5 + pxor %xmm7, %xmm5 movl %r10d, %edx movl %r11d, %ecx shlq $3, %rdx @@ -5260,46 +5260,46 @@ _AES_GCM_encrypt_final_aesni: pinsrq $0x00, %rdx, %xmm0 pinsrq $0x01, %rcx, %xmm0 pxor %xmm0, %xmm4 - pshufd $0x4e, %xmm5, %xmm9 - pshufd $0x4e, %xmm4, %xmm10 - movdqa %xmm4, %xmm11 - movdqa %xmm4, %xmm8 - pclmulqdq $0x11, %xmm5, %xmm11 - pclmulqdq $0x00, %xmm5, %xmm8 - pxor %xmm5, %xmm9 - pxor %xmm4, %xmm10 - pclmulqdq $0x00, %xmm10, %xmm9 - pxor %xmm8, %xmm9 - pxor %xmm11, %xmm9 - movdqa %xmm9, %xmm10 - movdqa %xmm11, %xmm4 - pslldq $8, %xmm10 - psrldq $8, %xmm9 + pshufd $0x4e, %xmm5, %xmm8 + pshufd $0x4e, %xmm4, %xmm9 + movdqa %xmm4, %xmm10 + movdqa %xmm4, %xmm7 + pclmulqdq $0x11, %xmm5, %xmm10 + pclmulqdq $0x00, %xmm5, %xmm7 + pxor %xmm5, %xmm8 + pxor %xmm4, %xmm9 + pclmulqdq $0x00, %xmm9, %xmm8 + pxor %xmm7, %xmm8 pxor %xmm10, %xmm8 - pxor %xmm9, %xmm4 - movdqa %xmm8, %xmm12 - movdqa %xmm8, %xmm13 - movdqa %xmm8, %xmm14 - pslld $31, %xmm12 - pslld $30, %xmm13 - pslld $25, %xmm14 - pxor %xmm13, %xmm12 - pxor %xmm14, %xmm12 - movdqa %xmm12, %xmm13 - psrldq $4, %xmm13 - pslldq $12, %xmm12 - pxor %xmm12, %xmm8 - movdqa %xmm8, %xmm14 - movdqa %xmm8, %xmm10 movdqa %xmm8, %xmm9 - psrld $0x01, %xmm14 - psrld $2, %xmm10 - psrld $7, %xmm9 - pxor %xmm10, %xmm14 - pxor %xmm9, %xmm14 - pxor %xmm13, %xmm14 - pxor %xmm8, %xmm14 - pxor %xmm14, %xmm4 + movdqa %xmm10, %xmm4 + pslldq $8, %xmm9 + psrldq $8, %xmm8 + pxor %xmm9, %xmm7 + pxor %xmm8, %xmm4 + movdqa %xmm7, %xmm11 + movdqa %xmm7, %xmm12 + movdqa %xmm7, %xmm13 + pslld $31, %xmm11 + pslld $30, %xmm12 + pslld $25, %xmm13 + pxor %xmm12, %xmm11 + pxor %xmm13, %xmm11 + movdqa %xmm11, %xmm12 + psrldq $4, %xmm12 + pslldq $12, %xmm11 + pxor %xmm11, %xmm7 + movdqa %xmm7, %xmm13 + movdqa %xmm7, %xmm9 + movdqa %xmm7, %xmm8 + psrld $0x01, %xmm13 + psrld $2, %xmm9 + psrld $7, %xmm8 + pxor %xmm9, %xmm13 + pxor %xmm8, %xmm13 + pxor %xmm12, %xmm13 + pxor %xmm7, %xmm13 + pxor %xmm13, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm4 movdqu %xmm6, %xmm0 pxor %xmm4, %xmm0 @@ -6113,16 +6113,16 @@ _AES_GCM_decrypt_final_aesni: movdqa (%rdi), %xmm6 movdqa (%r9), %xmm5 movdqa (%r8), %xmm15 - movdqa %xmm5, %xmm9 movdqa %xmm5, %xmm8 - psrlq $63, %xmm9 - psllq $0x01, %xmm8 - pslldq $8, %xmm9 - por %xmm9, %xmm8 + movdqa %xmm5, %xmm7 + psrlq $63, %xmm8 + psllq $0x01, %xmm7 + pslldq $8, %xmm8 + por %xmm8, %xmm7 pshufd $0xff, %xmm5, %xmm5 psrad $31, %xmm5 pand L_aes_gcm_mod2_128(%rip), %xmm5 - pxor %xmm8, %xmm5 + pxor %xmm7, %xmm5 movl %r10d, %edx movl %r11d, %ecx shlq $3, %rdx @@ -6130,46 +6130,46 @@ _AES_GCM_decrypt_final_aesni: pinsrq $0x00, %rdx, %xmm0 pinsrq $0x01, %rcx, %xmm0 pxor %xmm0, %xmm6 - pshufd $0x4e, %xmm5, %xmm9 - pshufd $0x4e, %xmm6, %xmm10 - movdqa %xmm6, %xmm11 - movdqa %xmm6, %xmm8 - pclmulqdq $0x11, %xmm5, %xmm11 - pclmulqdq $0x00, %xmm5, %xmm8 - pxor %xmm5, %xmm9 - pxor %xmm6, %xmm10 - pclmulqdq $0x00, %xmm10, %xmm9 - pxor %xmm8, %xmm9 - pxor %xmm11, %xmm9 - movdqa %xmm9, %xmm10 - movdqa %xmm11, %xmm6 - pslldq $8, %xmm10 - psrldq $8, %xmm9 + pshufd $0x4e, %xmm5, %xmm8 + pshufd $0x4e, %xmm6, %xmm9 + movdqa %xmm6, %xmm10 + movdqa %xmm6, %xmm7 + pclmulqdq $0x11, %xmm5, %xmm10 + pclmulqdq $0x00, %xmm5, %xmm7 + pxor %xmm5, %xmm8 + pxor %xmm6, %xmm9 + pclmulqdq $0x00, %xmm9, %xmm8 + pxor %xmm7, %xmm8 pxor %xmm10, %xmm8 - pxor %xmm9, %xmm6 - movdqa %xmm8, %xmm12 - movdqa %xmm8, %xmm13 - movdqa %xmm8, %xmm14 - pslld $31, %xmm12 - pslld $30, %xmm13 - pslld $25, %xmm14 - pxor %xmm13, %xmm12 - pxor %xmm14, %xmm12 - movdqa %xmm12, %xmm13 - psrldq $4, %xmm13 - pslldq $12, %xmm12 - pxor %xmm12, %xmm8 - movdqa %xmm8, %xmm14 - movdqa %xmm8, %xmm10 movdqa %xmm8, %xmm9 - psrld $0x01, %xmm14 - psrld $2, %xmm10 - psrld $7, %xmm9 - pxor %xmm10, %xmm14 - pxor %xmm9, %xmm14 - pxor %xmm13, %xmm14 - pxor %xmm8, %xmm14 - pxor %xmm14, %xmm6 + movdqa %xmm10, %xmm6 + pslldq $8, %xmm9 + psrldq $8, %xmm8 + pxor %xmm9, %xmm7 + pxor %xmm8, %xmm6 + movdqa %xmm7, %xmm11 + movdqa %xmm7, %xmm12 + movdqa %xmm7, %xmm13 + pslld $31, %xmm11 + pslld $30, %xmm12 + pslld $25, %xmm13 + pxor %xmm12, %xmm11 + pxor %xmm13, %xmm11 + movdqa %xmm11, %xmm12 + psrldq $4, %xmm12 + pslldq $12, %xmm11 + pxor %xmm11, %xmm7 + movdqa %xmm7, %xmm13 + movdqa %xmm7, %xmm9 + movdqa %xmm7, %xmm8 + psrld $0x01, %xmm13 + psrld $2, %xmm9 + psrld $7, %xmm8 + pxor %xmm9, %xmm13 + pxor %xmm8, %xmm13 + pxor %xmm12, %xmm13 + pxor %xmm7, %xmm13 + pxor %xmm13, %xmm6 pshufb L_aes_gcm_bswap_mask(%rip), %xmm6 movdqu %xmm15, %xmm0 pxor %xmm6, %xmm0 @@ -9133,53 +9133,53 @@ _AES_GCM_init_avx1: # H = Encrypt X(=0) and T = Encrypt counter vmovdqa (%rdi), %xmm5 vpxor %xmm5, %xmm4, %xmm1 - vmovdqa 16(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 32(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 48(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 64(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 80(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 96(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 112(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 128(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 144(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 16(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 32(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 48(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 64(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 80(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 96(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 112(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 128(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 144(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 cmpl $11, %esi - vmovdqa 160(%rdi), %xmm7 + vmovdqa 160(%rdi), %xmm6 jl L_AES_GCM_init_avx1_calc_iv_12_last - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 176(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 176(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 cmpl $13, %esi - vmovdqa 192(%rdi), %xmm7 + vmovdqa 192(%rdi), %xmm6 jl L_AES_GCM_init_avx1_calc_iv_12_last - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 208(%rdi), %xmm7 - vaesenc %xmm7, %xmm5, %xmm5 - vaesenc %xmm7, %xmm1, %xmm1 - vmovdqa 224(%rdi), %xmm7 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 208(%rdi), %xmm6 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm1, %xmm1 + vmovdqa 224(%rdi), %xmm6 L_AES_GCM_init_avx1_calc_iv_12_last: - vaesenclast %xmm7, %xmm5, %xmm5 - vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm6, %xmm5, %xmm5 + vaesenclast %xmm6, %xmm1, %xmm1 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 vmovdqu %xmm1, %xmm15 jmp L_AES_GCM_init_avx1_iv_done @@ -9197,18 +9197,18 @@ L_AES_GCM_init_avx1_iv_not_12: vaesenc 128(%rdi), %xmm5, %xmm5 vaesenc 144(%rdi), %xmm5, %xmm5 cmpl $11, %esi - vmovdqa 160(%rdi), %xmm9 + vmovdqa 160(%rdi), %xmm8 jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last - vaesenc %xmm9, %xmm5, %xmm5 + vaesenc %xmm8, %xmm5, %xmm5 vaesenc 176(%rdi), %xmm5, %xmm5 cmpl $13, %esi - vmovdqa 192(%rdi), %xmm9 + vmovdqa 192(%rdi), %xmm8 jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last - vaesenc %xmm9, %xmm5, %xmm5 + vaesenc %xmm8, %xmm5, %xmm5 vaesenc 208(%rdi), %xmm5, %xmm5 - vmovdqa 224(%rdi), %xmm9 + vmovdqa 224(%rdi), %xmm8 L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: - vaesenclast %xmm9, %xmm5, %xmm5 + vaesenclast %xmm8, %xmm5, %xmm5 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 # Calc counter # Initialization vector @@ -9219,9 +9219,9 @@ L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_init_avx1_calc_iv_lt16 andl $0xfffffff0, %edx L_AES_GCM_init_avx1_calc_iv_16_loop: - vmovdqu (%r10,%rcx,1), %xmm8 - vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 - vpxor %xmm8, %xmm4, %xmm4 + vmovdqu (%r10,%rcx,1), %xmm7 + vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 vpshufd $0x4e, %xmm5, %xmm2 @@ -9232,38 +9232,38 @@ L_AES_GCM_init_avx1_calc_iv_16_loop: vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 vpxor %xmm0, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa %xmm0, %xmm7 + vmovdqa %xmm0, %xmm6 vmovdqa %xmm3, %xmm4 vpslldq $8, %xmm1, %xmm2 vpsrldq $8, %xmm1, %xmm1 - vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm2, %xmm6, %xmm6 vpxor %xmm1, %xmm4, %xmm4 - vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm0 vpsrld $31, %xmm4, %xmm1 - vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 vpslld $0x01, %xmm4, %xmm4 vpsrldq $12, %xmm0, %xmm2 vpslldq $4, %xmm0, %xmm0 vpslldq $4, %xmm1, %xmm1 vpor %xmm2, %xmm4, %xmm4 - vpor %xmm0, %xmm7, %xmm7 + vpor %xmm0, %xmm6, %xmm6 vpor %xmm1, %xmm4, %xmm4 - vpslld $31, %xmm7, %xmm0 - vpslld $30, %xmm7, %xmm1 - vpslld $25, %xmm7, %xmm2 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 vpxor %xmm1, %xmm0, %xmm0 vpxor %xmm2, %xmm0, %xmm0 vmovdqa %xmm0, %xmm1 vpsrldq $4, %xmm1, %xmm1 vpslldq $12, %xmm0, %xmm0 - vpxor %xmm0, %xmm7, %xmm7 - vpsrld $0x01, %xmm7, %xmm2 - vpsrld $2, %xmm7, %xmm3 - vpsrld $7, %xmm7, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 vpxor %xmm3, %xmm2, %xmm2 vpxor %xmm0, %xmm2, %xmm2 vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 vpxor %xmm2, %xmm4, %xmm4 addl $16, %ecx cmpl %edx, %ecx @@ -9273,9 +9273,9 @@ L_AES_GCM_init_avx1_calc_iv_16_loop: je L_AES_GCM_init_avx1_calc_iv_done L_AES_GCM_init_avx1_calc_iv_lt16: subq $16, %rsp - vpxor %xmm8, %xmm8, %xmm8 + vpxor %xmm7, %xmm7, %xmm7 xorl %r13d, %r13d - vmovdqu %xmm8, (%rsp) + vmovdqu %xmm7, (%rsp) L_AES_GCM_init_avx1_calc_iv_loop: movzbl (%r10,%rcx,1), %r12d movb %r12b, (%rsp,%r13,1) @@ -9283,10 +9283,10 @@ L_AES_GCM_init_avx1_calc_iv_loop: incl %r13d cmpl %edx, %ecx jl L_AES_GCM_init_avx1_calc_iv_loop - vmovdqu (%rsp), %xmm8 + vmovdqu (%rsp), %xmm7 addq $16, %rsp - vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 - vpxor %xmm8, %xmm4, %xmm4 + vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 vpshufd $0x4e, %xmm5, %xmm2 @@ -9297,38 +9297,38 @@ L_AES_GCM_init_avx1_calc_iv_loop: vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 vpxor %xmm0, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa %xmm0, %xmm7 + vmovdqa %xmm0, %xmm6 vmovdqa %xmm3, %xmm4 vpslldq $8, %xmm1, %xmm2 vpsrldq $8, %xmm1, %xmm1 - vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm2, %xmm6, %xmm6 vpxor %xmm1, %xmm4, %xmm4 - vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm0 vpsrld $31, %xmm4, %xmm1 - vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 vpslld $0x01, %xmm4, %xmm4 vpsrldq $12, %xmm0, %xmm2 vpslldq $4, %xmm0, %xmm0 vpslldq $4, %xmm1, %xmm1 vpor %xmm2, %xmm4, %xmm4 - vpor %xmm0, %xmm7, %xmm7 + vpor %xmm0, %xmm6, %xmm6 vpor %xmm1, %xmm4, %xmm4 - vpslld $31, %xmm7, %xmm0 - vpslld $30, %xmm7, %xmm1 - vpslld $25, %xmm7, %xmm2 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 vpxor %xmm1, %xmm0, %xmm0 vpxor %xmm2, %xmm0, %xmm0 vmovdqa %xmm0, %xmm1 vpsrldq $4, %xmm1, %xmm1 vpslldq $12, %xmm0, %xmm0 - vpxor %xmm0, %xmm7, %xmm7 - vpsrld $0x01, %xmm7, %xmm2 - vpsrld $2, %xmm7, %xmm3 - vpsrld $7, %xmm7, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 vpxor %xmm3, %xmm2, %xmm2 vpxor %xmm0, %xmm2, %xmm2 vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 vpxor %xmm2, %xmm4, %xmm4 L_AES_GCM_init_avx1_calc_iv_done: # T = Encrypt counter @@ -9346,66 +9346,66 @@ L_AES_GCM_init_avx1_calc_iv_done: vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 vpxor %xmm0, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa %xmm0, %xmm7 + vmovdqa %xmm0, %xmm6 vmovdqa %xmm3, %xmm4 vpslldq $8, %xmm1, %xmm2 vpsrldq $8, %xmm1, %xmm1 - vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm2, %xmm6, %xmm6 vpxor %xmm1, %xmm4, %xmm4 - vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm0 vpsrld $31, %xmm4, %xmm1 - vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 vpslld $0x01, %xmm4, %xmm4 vpsrldq $12, %xmm0, %xmm2 vpslldq $4, %xmm0, %xmm0 vpslldq $4, %xmm1, %xmm1 vpor %xmm2, %xmm4, %xmm4 - vpor %xmm0, %xmm7, %xmm7 + vpor %xmm0, %xmm6, %xmm6 vpor %xmm1, %xmm4, %xmm4 - vpslld $31, %xmm7, %xmm0 - vpslld $30, %xmm7, %xmm1 - vpslld $25, %xmm7, %xmm2 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 vpxor %xmm1, %xmm0, %xmm0 vpxor %xmm2, %xmm0, %xmm0 vmovdqa %xmm0, %xmm1 vpsrldq $4, %xmm1, %xmm1 vpslldq $12, %xmm0, %xmm0 - vpxor %xmm0, %xmm7, %xmm7 - vpsrld $0x01, %xmm7, %xmm2 - vpsrld $2, %xmm7, %xmm3 - vpsrld $7, %xmm7, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 vpxor %xmm3, %xmm2, %xmm2 vpxor %xmm0, %xmm2, %xmm2 vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 vpxor %xmm2, %xmm4, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 # Encrypt counter - vmovdqa (%rdi), %xmm8 - vpxor %xmm4, %xmm8, %xmm8 - vaesenc 16(%rdi), %xmm8, %xmm8 - vaesenc 32(%rdi), %xmm8, %xmm8 - vaesenc 48(%rdi), %xmm8, %xmm8 - vaesenc 64(%rdi), %xmm8, %xmm8 - vaesenc 80(%rdi), %xmm8, %xmm8 - vaesenc 96(%rdi), %xmm8, %xmm8 - vaesenc 112(%rdi), %xmm8, %xmm8 - vaesenc 128(%rdi), %xmm8, %xmm8 - vaesenc 144(%rdi), %xmm8, %xmm8 + vmovdqa (%rdi), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%rdi), %xmm7, %xmm7 + vaesenc 32(%rdi), %xmm7, %xmm7 + vaesenc 48(%rdi), %xmm7, %xmm7 + vaesenc 64(%rdi), %xmm7, %xmm7 + vaesenc 80(%rdi), %xmm7, %xmm7 + vaesenc 96(%rdi), %xmm7, %xmm7 + vaesenc 112(%rdi), %xmm7, %xmm7 + vaesenc 128(%rdi), %xmm7, %xmm7 + vaesenc 144(%rdi), %xmm7, %xmm7 cmpl $11, %esi - vmovdqa 160(%rdi), %xmm9 + vmovdqa 160(%rdi), %xmm8 jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last - vaesenc %xmm9, %xmm8, %xmm8 - vaesenc 176(%rdi), %xmm8, %xmm8 + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 176(%rdi), %xmm7, %xmm7 cmpl $13, %esi - vmovdqa 192(%rdi), %xmm9 + vmovdqa 192(%rdi), %xmm8 jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last - vaesenc %xmm9, %xmm8, %xmm8 - vaesenc 208(%rdi), %xmm8, %xmm8 - vmovdqa 224(%rdi), %xmm9 + vaesenc %xmm8, %xmm7, %xmm7 + vaesenc 208(%rdi), %xmm7, %xmm7 + vmovdqa 224(%rdi), %xmm8 L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last: - vaesenclast %xmm9, %xmm8, %xmm8 - vmovdqu %xmm8, %xmm15 + vaesenclast %xmm8, %xmm7, %xmm7 + vmovdqu %xmm7, %xmm15 L_AES_GCM_init_avx1_iv_done: vmovdqa %xmm15, (%rax) vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 @@ -9437,9 +9437,9 @@ _AES_GCM_aad_update_avx1: vmovdqa (%rax), %xmm6 xorl %ecx, %ecx L_AES_GCM_aad_update_avx1_16_loop: - vmovdqu (%rdi,%rcx,1), %xmm8 - vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 - vpxor %xmm8, %xmm5, %xmm5 + vmovdqu (%rdi,%rcx,1), %xmm7 + vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm5, %xmm5 # ghash_gfmul_avx vpshufd $0x4e, %xmm5, %xmm1 vpshufd $0x4e, %xmm6, %xmm2 @@ -9506,37 +9506,37 @@ _AES_GCM_encrypt_block_avx1: #endif /* __APPLE__ */ movq %rdx, %r10 movq %rcx, %r11 - vmovdqu (%r8), %xmm9 - vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 - vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqu %xmm9, (%r8) - vpxor (%rdi), %xmm8, %xmm8 - vaesenc 16(%rdi), %xmm8, %xmm8 - vaesenc 32(%rdi), %xmm8, %xmm8 - vaesenc 48(%rdi), %xmm8, %xmm8 - vaesenc 64(%rdi), %xmm8, %xmm8 - vaesenc 80(%rdi), %xmm8, %xmm8 - vaesenc 96(%rdi), %xmm8, %xmm8 - vaesenc 112(%rdi), %xmm8, %xmm8 - vaesenc 128(%rdi), %xmm8, %xmm8 - vaesenc 144(%rdi), %xmm8, %xmm8 + vmovdqu (%r8), %xmm1 + vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0 + vpaddd L_avx1_aes_gcm_one(%rip), %xmm1, %xmm1 + vmovdqu %xmm1, (%r8) + vpxor (%rdi), %xmm0, %xmm0 + vaesenc 16(%rdi), %xmm0, %xmm0 + vaesenc 32(%rdi), %xmm0, %xmm0 + vaesenc 48(%rdi), %xmm0, %xmm0 + vaesenc 64(%rdi), %xmm0, %xmm0 + vaesenc 80(%rdi), %xmm0, %xmm0 + vaesenc 96(%rdi), %xmm0, %xmm0 + vaesenc 112(%rdi), %xmm0, %xmm0 + vaesenc 128(%rdi), %xmm0, %xmm0 + vaesenc 144(%rdi), %xmm0, %xmm0 cmpl $11, %esi - vmovdqa 160(%rdi), %xmm9 + vmovdqa 160(%rdi), %xmm1 jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last - vaesenc %xmm9, %xmm8, %xmm8 - vaesenc 176(%rdi), %xmm8, %xmm8 + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%rdi), %xmm0, %xmm0 cmpl $13, %esi - vmovdqa 192(%rdi), %xmm9 + vmovdqa 192(%rdi), %xmm1 jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last - vaesenc %xmm9, %xmm8, %xmm8 - vaesenc 208(%rdi), %xmm8, %xmm8 - vmovdqa 224(%rdi), %xmm9 + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%rdi), %xmm0, %xmm0 + vmovdqa 224(%rdi), %xmm1 L_AES_GCM_encrypt_block_avx1_aesenc_block_last: - vaesenclast %xmm9, %xmm8, %xmm8 - vmovdqu (%r11), %xmm9 - vpxor %xmm9, %xmm8, %xmm8 - vmovdqu %xmm8, (%r10) - vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%r11), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%r10) + vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 vzeroupper repz retq #ifndef __APPLE__ @@ -9556,9 +9556,9 @@ _AES_GCM_ghash_block_avx1: #endif /* __APPLE__ */ vmovdqa (%rsi), %xmm4 vmovdqa (%rdx), %xmm5 - vmovdqu (%rdi), %xmm8 - vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8 - vpxor %xmm8, %xmm4, %xmm4 + vmovdqu (%rdi), %xmm7 + vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm7, %xmm7 + vpxor %xmm7, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 vpshufd $0x4e, %xmm5, %xmm2 @@ -10654,14 +10654,14 @@ _AES_GCM_encrypt_final_avx1: vmovdqa (%rdi), %xmm4 vmovdqa (%r9), %xmm5 vmovdqa (%r8), %xmm6 - vpsrlq $63, %xmm5, %xmm9 - vpsllq $0x01, %xmm5, %xmm8 - vpslldq $8, %xmm9, %xmm9 - vpor %xmm9, %xmm8, %xmm8 + vpsrlq $63, %xmm5, %xmm8 + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 vpshufd $0xff, %xmm5, %xmm5 vpsrad $31, %xmm5, %xmm5 vpand L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 - vpxor %xmm8, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 movl %r10d, %edx movl %r11d, %ecx shlq $3, %rdx @@ -10671,35 +10671,35 @@ _AES_GCM_encrypt_final_avx1: vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_red_avx - vpshufd $0x4e, %xmm5, %xmm9 - vpshufd $0x4e, %xmm4, %xmm10 - vpclmulqdq $0x11, %xmm5, %xmm4, %xmm11 - vpclmulqdq $0x00, %xmm5, %xmm4, %xmm8 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm4, %xmm10, %xmm10 - vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 - vpxor %xmm8, %xmm9, %xmm9 - vpxor %xmm11, %xmm9, %xmm9 - vpslldq $8, %xmm9, %xmm10 - vpsrldq $8, %xmm9, %xmm9 + vpshufd $0x4e, %xmm5, %xmm8 + vpshufd $0x4e, %xmm4, %xmm9 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm7 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm4, %xmm9, %xmm9 + vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm10, %xmm8, %xmm8 - vpxor %xmm9, %xmm11, %xmm4 - vpslld $31, %xmm8, %xmm12 - vpslld $30, %xmm8, %xmm13 - vpslld $25, %xmm8, %xmm14 - vpxor %xmm13, %xmm12, %xmm12 - vpxor %xmm14, %xmm12, %xmm12 - vpsrldq $4, %xmm12, %xmm13 - vpslldq $12, %xmm12, %xmm12 - vpxor %xmm12, %xmm8, %xmm8 - vpsrld $0x01, %xmm8, %xmm14 - vpsrld $2, %xmm8, %xmm10 - vpsrld $7, %xmm8, %xmm9 - vpxor %xmm10, %xmm14, %xmm14 - vpxor %xmm9, %xmm14, %xmm14 - vpxor %xmm13, %xmm14, %xmm14 - vpxor %xmm8, %xmm14, %xmm14 - vpxor %xmm14, %xmm4, %xmm4 + vpslldq $8, %xmm8, %xmm9 + vpsrldq $8, %xmm8, %xmm8 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm8, %xmm10, %xmm4 + vpslld $31, %xmm7, %xmm11 + vpslld $30, %xmm7, %xmm12 + vpslld $25, %xmm7, %xmm13 + vpxor %xmm12, %xmm11, %xmm11 + vpxor %xmm13, %xmm11, %xmm11 + vpsrldq $4, %xmm11, %xmm12 + vpslldq $12, %xmm11, %xmm11 + vpxor %xmm11, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm13 + vpsrld $2, %xmm7, %xmm9 + vpsrld $7, %xmm7, %xmm8 + vpxor %xmm9, %xmm13, %xmm13 + vpxor %xmm8, %xmm13, %xmm13 + vpxor %xmm12, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + vpxor %xmm13, %xmm4, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4 vpxor %xmm6, %xmm4, %xmm0 cmpl $16, %eax @@ -11356,14 +11356,14 @@ _AES_GCM_decrypt_final_avx1: vmovdqa (%rdi), %xmm6 vmovdqa (%r9), %xmm5 vmovdqa (%r8), %xmm15 - vpsrlq $63, %xmm5, %xmm9 - vpsllq $0x01, %xmm5, %xmm8 - vpslldq $8, %xmm9, %xmm9 - vpor %xmm9, %xmm8, %xmm8 + vpsrlq $63, %xmm5, %xmm8 + vpsllq $0x01, %xmm5, %xmm7 + vpslldq $8, %xmm8, %xmm8 + vpor %xmm8, %xmm7, %xmm7 vpshufd $0xff, %xmm5, %xmm5 vpsrad $31, %xmm5, %xmm5 vpand L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 - vpxor %xmm8, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 movl %r10d, %edx movl %r11d, %ecx shlq $3, %rdx @@ -11373,35 +11373,35 @@ _AES_GCM_decrypt_final_avx1: vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm6, %xmm6 # ghash_gfmul_red_avx - vpshufd $0x4e, %xmm5, %xmm9 - vpshufd $0x4e, %xmm6, %xmm10 - vpclmulqdq $0x11, %xmm5, %xmm6, %xmm11 - vpclmulqdq $0x00, %xmm5, %xmm6, %xmm8 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm10, %xmm10 - vpclmulqdq $0x00, %xmm10, %xmm9, %xmm9 - vpxor %xmm8, %xmm9, %xmm9 - vpxor %xmm11, %xmm9, %xmm9 - vpslldq $8, %xmm9, %xmm10 - vpsrldq $8, %xmm9, %xmm9 + vpshufd $0x4e, %xmm5, %xmm8 + vpshufd $0x4e, %xmm6, %xmm9 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm10 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm7 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm9, %xmm9 + vpclmulqdq $0x00, %xmm9, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm10, %xmm8, %xmm8 - vpxor %xmm9, %xmm11, %xmm6 - vpslld $31, %xmm8, %xmm12 - vpslld $30, %xmm8, %xmm13 - vpslld $25, %xmm8, %xmm14 - vpxor %xmm13, %xmm12, %xmm12 - vpxor %xmm14, %xmm12, %xmm12 - vpsrldq $4, %xmm12, %xmm13 - vpslldq $12, %xmm12, %xmm12 - vpxor %xmm12, %xmm8, %xmm8 - vpsrld $0x01, %xmm8, %xmm14 - vpsrld $2, %xmm8, %xmm10 - vpsrld $7, %xmm8, %xmm9 - vpxor %xmm10, %xmm14, %xmm14 - vpxor %xmm9, %xmm14, %xmm14 - vpxor %xmm13, %xmm14, %xmm14 - vpxor %xmm8, %xmm14, %xmm14 - vpxor %xmm14, %xmm6, %xmm6 + vpslldq $8, %xmm8, %xmm9 + vpsrldq $8, %xmm8, %xmm8 + vpxor %xmm9, %xmm7, %xmm7 + vpxor %xmm8, %xmm10, %xmm6 + vpslld $31, %xmm7, %xmm11 + vpslld $30, %xmm7, %xmm12 + vpslld $25, %xmm7, %xmm13 + vpxor %xmm12, %xmm11, %xmm11 + vpxor %xmm13, %xmm11, %xmm11 + vpsrldq $4, %xmm11, %xmm12 + vpslldq $12, %xmm11, %xmm11 + vpxor %xmm11, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm13 + vpsrld $2, %xmm7, %xmm9 + vpsrld $7, %xmm7, %xmm8 + vpxor %xmm9, %xmm13, %xmm13 + vpxor %xmm8, %xmm13, %xmm13 + vpxor %xmm12, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + vpxor %xmm13, %xmm6, %xmm6 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 vpxor %xmm15, %xmm6, %xmm0 cmpl $16, %eax diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index 031a0961c..ff52a93db 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -115,7 +115,17 @@ AES_GCM_encrypt PROC mov r14d, DWORD PTR [rsp+128] mov r15, QWORD PTR [rsp+136] mov r10d, DWORD PTR [rsp+144] - sub rsp, 160 + sub rsp, 320 + movdqu [rsp+160], xmm6 + movdqu [rsp+176], xmm7 + movdqu [rsp+192], xmm8 + movdqu [rsp+208], xmm9 + movdqu [rsp+224], xmm10 + movdqu [rsp+240], xmm11 + movdqu [rsp+256], xmm12 + movdqu [rsp+272], xmm13 + movdqu [rsp+288], xmm14 + movdqu [rsp+304], xmm15 pxor xmm4, xmm4 pxor xmm6, xmm6 cmp ebx, 12 @@ -1935,7 +1945,17 @@ L_AES_GCM_encrypt_store_tag_loop: L_AES_GCM_encrypt_store_tag_16: movdqu [r8], xmm0 L_AES_GCM_encrypt_store_tag_done: - add rsp, 160 + movdqu xmm6, [rsp+160] + movdqu xmm7, [rsp+176] + movdqu xmm8, [rsp+192] + movdqu xmm9, [rsp+208] + movdqu xmm10, [rsp+224] + movdqu xmm11, [rsp+240] + movdqu xmm12, [rsp+256] + movdqu xmm13, [rsp+272] + movdqu xmm14, [rsp+288] + movdqu xmm15, [rsp+304] + add rsp, 320 pop r15 pop r14 pop rbx @@ -1968,7 +1988,17 @@ AES_GCM_decrypt PROC mov r15, QWORD PTR [rsp+144] mov r10d, DWORD PTR [rsp+152] mov rbp, QWORD PTR [rsp+160] - sub rsp, 168 + sub rsp, 328 + movdqu [rsp+168], xmm6 + movdqu [rsp+184], xmm7 + movdqu [rsp+200], xmm8 + movdqu [rsp+216], xmm9 + movdqu [rsp+232], xmm10 + movdqu [rsp+248], xmm11 + movdqu [rsp+264], xmm12 + movdqu [rsp+280], xmm13 + movdqu [rsp+296], xmm14 + movdqu [rsp+312], xmm15 pxor xmm4, xmm4 pxor xmm6, xmm6 cmp ebx, 12 @@ -3337,7 +3367,17 @@ L_AES_GCM_decrypt_cmp_tag_16: sete bl L_AES_GCM_decrypt_cmp_tag_done: mov DWORD PTR [rbp], ebx - add rsp, 168 + movdqu xmm6, [rsp+168] + movdqu xmm7, [rsp+184] + movdqu xmm8, [rsp+200] + movdqu xmm9, [rsp+216] + movdqu xmm10, [rsp+232] + movdqu xmm11, [rsp+248] + movdqu xmm12, [rsp+264] + movdqu xmm13, [rsp+280] + movdqu xmm14, [rsp+296] + movdqu xmm15, [rsp+312] + add rsp, 328 pop rbp pop r15 pop r14 @@ -3363,7 +3403,11 @@ AES_GCM_init_aesni PROC mov rax, QWORD PTR [rsp+80] mov r8, QWORD PTR [rsp+88] mov r9, QWORD PTR [rsp+96] - sub rsp, 16 + sub rsp, 80 + movdqu [rsp+16], xmm6 + movdqu [rsp+32], xmm7 + movdqu [rsp+48], xmm8 + movdqu [rsp+64], xmm15 pxor xmm4, xmm4 mov edx, r11d cmp edx, 12 @@ -3378,53 +3422,53 @@ AES_GCM_init_aesni PROC movdqa xmm1, xmm4 movdqa xmm5, OWORD PTR [rdi] pxor xmm1, xmm5 - movdqa xmm7, OWORD PTR [rdi+16] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+32] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+48] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+64] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+80] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+96] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+112] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+128] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+144] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 + movdqa xmm6, OWORD PTR [rdi+16] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+32] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+48] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+64] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+80] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+96] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+112] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+128] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+144] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 cmp esi, 11 - movdqa xmm7, OWORD PTR [rdi+160] + movdqa xmm6, OWORD PTR [rdi+160] jl L_AES_GCM_init_aesni_calc_iv_12_last - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+176] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+176] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 cmp esi, 13 - movdqa xmm7, OWORD PTR [rdi+192] + movdqa xmm6, OWORD PTR [rdi+192] jl L_AES_GCM_init_aesni_calc_iv_12_last - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+208] - aesenc xmm5, xmm7 - aesenc xmm1, xmm7 - movdqa xmm7, OWORD PTR [rdi+224] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+208] + aesenc xmm5, xmm6 + aesenc xmm1, xmm6 + movdqa xmm6, OWORD PTR [rdi+224] L_AES_GCM_init_aesni_calc_iv_12_last: - aesenclast xmm5, xmm7 - aesenclast xmm1, xmm7 + aesenclast xmm5, xmm6 + aesenclast xmm1, xmm6 pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask movdqu xmm15, xmm1 jmp L_AES_GCM_init_aesni_iv_done @@ -3442,18 +3486,18 @@ L_AES_GCM_init_aesni_iv_not_12: aesenc xmm5, [rdi+128] aesenc xmm5, [rdi+144] cmp esi, 11 - movdqa xmm9, OWORD PTR [rdi+160] + movdqa xmm8, OWORD PTR [rdi+160] jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last - aesenc xmm5, xmm9 + aesenc xmm5, xmm8 aesenc xmm5, [rdi+176] cmp esi, 13 - movdqa xmm9, OWORD PTR [rdi+192] + movdqa xmm8, OWORD PTR [rdi+192] jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last - aesenc xmm5, xmm9 + aesenc xmm5, xmm8 aesenc xmm5, [rdi+208] - movdqa xmm9, OWORD PTR [rdi+224] + movdqa xmm8, OWORD PTR [rdi+224] L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: - aesenclast xmm5, xmm9 + aesenclast xmm5, xmm8 pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask ; Calc counter ; Initialization vector @@ -3464,9 +3508,9 @@ L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_init_aesni_calc_iv_lt16 and edx, 4294967280 L_AES_GCM_init_aesni_calc_iv_16_loop: - movdqu xmm8, [r10+rcx] - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask - pxor xmm4, xmm8 + movdqu xmm7, [r10+rcx] + pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 pshufd xmm1, xmm4, 78 pshufd xmm2, xmm5, 78 movdqa xmm3, xmm5 @@ -3479,28 +3523,28 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: pxor xmm1, xmm0 pxor xmm1, xmm3 movdqa xmm2, xmm1 - movdqa xmm7, xmm0 + movdqa xmm6, xmm0 movdqa xmm4, xmm3 pslldq xmm2, 8 psrldq xmm1, 8 - pxor xmm7, xmm2 + pxor xmm6, xmm2 pxor xmm4, xmm1 - movdqa xmm0, xmm7 + movdqa xmm0, xmm6 movdqa xmm1, xmm4 psrld xmm0, 31 psrld xmm1, 31 - pslld xmm7, 1 + pslld xmm6, 1 pslld xmm4, 1 movdqa xmm2, xmm0 pslldq xmm0, 4 psrldq xmm2, 12 pslldq xmm1, 4 por xmm4, xmm2 - por xmm7, xmm0 + por xmm6, xmm0 por xmm4, xmm1 - movdqa xmm0, xmm7 - movdqa xmm1, xmm7 - movdqa xmm2, xmm7 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 pslld xmm0, 31 pslld xmm1, 30 pslld xmm2, 25 @@ -3509,17 +3553,17 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: movdqa xmm1, xmm0 psrldq xmm1, 4 pslldq xmm0, 12 - pxor xmm7, xmm0 - movdqa xmm2, xmm7 - movdqa xmm3, xmm7 - movdqa xmm0, xmm7 + pxor xmm6, xmm0 + movdqa xmm2, xmm6 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 psrld xmm2, 1 psrld xmm3, 2 psrld xmm0, 7 pxor xmm2, xmm3 pxor xmm2, xmm0 pxor xmm2, xmm1 - pxor xmm2, xmm7 + pxor xmm2, xmm6 pxor xmm4, xmm2 add ecx, 16 cmp ecx, edx @@ -3529,9 +3573,9 @@ L_AES_GCM_init_aesni_calc_iv_16_loop: je L_AES_GCM_init_aesni_calc_iv_done L_AES_GCM_init_aesni_calc_iv_lt16: sub rsp, 16 - pxor xmm8, xmm8 + pxor xmm7, xmm7 xor r13d, r13d - movdqu [rsp], xmm8 + movdqu [rsp], xmm7 L_AES_GCM_init_aesni_calc_iv_loop: movzx r12d, BYTE PTR [r10+rcx] mov BYTE PTR [rsp+r13], r12b @@ -3539,10 +3583,10 @@ L_AES_GCM_init_aesni_calc_iv_loop: inc r13d cmp ecx, edx jl L_AES_GCM_init_aesni_calc_iv_loop - movdqu xmm8, [rsp] + movdqu xmm7, [rsp] add rsp, 16 - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask - pxor xmm4, xmm8 + pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 pshufd xmm1, xmm4, 78 pshufd xmm2, xmm5, 78 movdqa xmm3, xmm5 @@ -3555,28 +3599,28 @@ L_AES_GCM_init_aesni_calc_iv_loop: pxor xmm1, xmm0 pxor xmm1, xmm3 movdqa xmm2, xmm1 - movdqa xmm7, xmm0 + movdqa xmm6, xmm0 movdqa xmm4, xmm3 pslldq xmm2, 8 psrldq xmm1, 8 - pxor xmm7, xmm2 + pxor xmm6, xmm2 pxor xmm4, xmm1 - movdqa xmm0, xmm7 + movdqa xmm0, xmm6 movdqa xmm1, xmm4 psrld xmm0, 31 psrld xmm1, 31 - pslld xmm7, 1 + pslld xmm6, 1 pslld xmm4, 1 movdqa xmm2, xmm0 pslldq xmm0, 4 psrldq xmm2, 12 pslldq xmm1, 4 por xmm4, xmm2 - por xmm7, xmm0 + por xmm6, xmm0 por xmm4, xmm1 - movdqa xmm0, xmm7 - movdqa xmm1, xmm7 - movdqa xmm2, xmm7 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 pslld xmm0, 31 pslld xmm1, 30 pslld xmm2, 25 @@ -3585,17 +3629,17 @@ L_AES_GCM_init_aesni_calc_iv_loop: movdqa xmm1, xmm0 psrldq xmm1, 4 pslldq xmm0, 12 - pxor xmm7, xmm0 - movdqa xmm2, xmm7 - movdqa xmm3, xmm7 - movdqa xmm0, xmm7 + pxor xmm6, xmm0 + movdqa xmm2, xmm6 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 psrld xmm2, 1 psrld xmm3, 2 psrld xmm0, 7 pxor xmm2, xmm3 pxor xmm2, xmm0 pxor xmm2, xmm1 - pxor xmm2, xmm7 + pxor xmm2, xmm6 pxor xmm4, xmm2 L_AES_GCM_init_aesni_calc_iv_done: ; T = Encrypt counter @@ -3615,28 +3659,28 @@ L_AES_GCM_init_aesni_calc_iv_done: pxor xmm1, xmm0 pxor xmm1, xmm3 movdqa xmm2, xmm1 - movdqa xmm7, xmm0 + movdqa xmm6, xmm0 movdqa xmm4, xmm3 pslldq xmm2, 8 psrldq xmm1, 8 - pxor xmm7, xmm2 + pxor xmm6, xmm2 pxor xmm4, xmm1 - movdqa xmm0, xmm7 + movdqa xmm0, xmm6 movdqa xmm1, xmm4 psrld xmm0, 31 psrld xmm1, 31 - pslld xmm7, 1 + pslld xmm6, 1 pslld xmm4, 1 movdqa xmm2, xmm0 pslldq xmm0, 4 psrldq xmm2, 12 pslldq xmm1, 4 por xmm4, xmm2 - por xmm7, xmm0 + por xmm6, xmm0 por xmm4, xmm1 - movdqa xmm0, xmm7 - movdqa xmm1, xmm7 - movdqa xmm2, xmm7 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 pslld xmm0, 31 pslld xmm1, 30 pslld xmm2, 25 @@ -3645,52 +3689,56 @@ L_AES_GCM_init_aesni_calc_iv_done: movdqa xmm1, xmm0 psrldq xmm1, 4 pslldq xmm0, 12 - pxor xmm7, xmm0 - movdqa xmm2, xmm7 - movdqa xmm3, xmm7 - movdqa xmm0, xmm7 + pxor xmm6, xmm0 + movdqa xmm2, xmm6 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 psrld xmm2, 1 psrld xmm3, 2 psrld xmm0, 7 pxor xmm2, xmm3 pxor xmm2, xmm0 pxor xmm2, xmm1 - pxor xmm2, xmm7 + pxor xmm2, xmm6 pxor xmm4, xmm2 pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask ; Encrypt counter - movdqa xmm8, OWORD PTR [rdi] - pxor xmm8, xmm4 - aesenc xmm8, [rdi+16] - aesenc xmm8, [rdi+32] - aesenc xmm8, [rdi+48] - aesenc xmm8, [rdi+64] - aesenc xmm8, [rdi+80] - aesenc xmm8, [rdi+96] - aesenc xmm8, [rdi+112] - aesenc xmm8, [rdi+128] - aesenc xmm8, [rdi+144] + movdqa xmm7, OWORD PTR [rdi] + pxor xmm7, xmm4 + aesenc xmm7, [rdi+16] + aesenc xmm7, [rdi+32] + aesenc xmm7, [rdi+48] + aesenc xmm7, [rdi+64] + aesenc xmm7, [rdi+80] + aesenc xmm7, [rdi+96] + aesenc xmm7, [rdi+112] + aesenc xmm7, [rdi+128] + aesenc xmm7, [rdi+144] cmp esi, 11 - movdqa xmm9, OWORD PTR [rdi+160] + movdqa xmm8, OWORD PTR [rdi+160] jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last - aesenc xmm8, xmm9 - aesenc xmm8, [rdi+176] + aesenc xmm7, xmm8 + aesenc xmm7, [rdi+176] cmp esi, 13 - movdqa xmm9, OWORD PTR [rdi+192] + movdqa xmm8, OWORD PTR [rdi+192] jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last - aesenc xmm8, xmm9 - aesenc xmm8, [rdi+208] - movdqa xmm9, OWORD PTR [rdi+224] + aesenc xmm7, xmm8 + aesenc xmm7, [rdi+208] + movdqa xmm8, OWORD PTR [rdi+224] L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: - aesenclast xmm8, xmm9 - movdqu xmm15, xmm8 + aesenclast xmm7, xmm8 + movdqu xmm15, xmm7 L_AES_GCM_init_aesni_iv_done: movdqa OWORD PTR [r9], xmm15 pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm4, OWORD PTR L_aes_gcm_one movdqa OWORD PTR [rax], xmm5 movdqa OWORD PTR [r8], xmm4 - add rsp, 16 + movdqu xmm6, [rsp+16] + movdqu xmm7, [rsp+32] + movdqu xmm8, [rsp+48] + movdqu xmm15, [rsp+64] + add rsp, 80 pop r14 pop r13 pop r12 @@ -3702,13 +3750,16 @@ _text ENDS _text SEGMENT READONLY PARA AES_GCM_aad_update_aesni PROC mov rax, rcx + sub rsp, 32 + movdqu [rsp], xmm6 + movdqu [rsp+16], xmm7 movdqa xmm5, OWORD PTR [r8] movdqa xmm6, OWORD PTR [r9] xor ecx, ecx L_AES_GCM_aad_update_aesni_16_loop: - movdqu xmm8, [rax+rcx] - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask - pxor xmm5, xmm8 + movdqu xmm7, [rax+rcx] + pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm5, xmm7 pshufd xmm1, xmm5, 78 pshufd xmm2, xmm6, 78 movdqa xmm3, xmm6 @@ -3767,6 +3818,9 @@ L_AES_GCM_aad_update_aesni_16_loop: cmp ecx, edx jl L_AES_GCM_aad_update_aesni_16_loop movdqa OWORD PTR [r8], xmm5 + movdqu xmm6, [rsp] + movdqu xmm7, [rsp+16] + add rsp, 32 ret AES_GCM_aad_update_aesni ENDP _text ENDS @@ -3775,48 +3829,51 @@ AES_GCM_encrypt_block_aesni PROC mov r10, r8 mov r11, r9 mov rax, QWORD PTR [rsp+40] - movdqu xmm8, [rax] - movdqa xmm9, xmm8 - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 - paddd xmm9, OWORD PTR L_aes_gcm_one - pxor xmm8, [rcx] - movdqu [rax], xmm9 - aesenc xmm8, [rcx+16] - aesenc xmm8, [rcx+32] - aesenc xmm8, [rcx+48] - aesenc xmm8, [rcx+64] - aesenc xmm8, [rcx+80] - aesenc xmm8, [rcx+96] - aesenc xmm8, [rcx+112] - aesenc xmm8, [rcx+128] - aesenc xmm8, [rcx+144] + movdqu xmm0, [rax] + movdqa xmm1, xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm1, OWORD PTR L_aes_gcm_one + pxor xmm0, [rcx] + movdqu [rax], xmm1 + aesenc xmm0, [rcx+16] + aesenc xmm0, [rcx+32] + aesenc xmm0, [rcx+48] + aesenc xmm0, [rcx+64] + aesenc xmm0, [rcx+80] + aesenc xmm0, [rcx+96] + aesenc xmm0, [rcx+112] + aesenc xmm0, [rcx+128] + aesenc xmm0, [rcx+144] cmp edx, 11 - movdqa xmm9, OWORD PTR [rcx+160] + movdqa xmm1, OWORD PTR [rcx+160] jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last - aesenc xmm8, xmm9 - aesenc xmm8, [rcx+176] + aesenc xmm0, xmm1 + aesenc xmm0, [rcx+176] cmp edx, 13 - movdqa xmm9, OWORD PTR [rcx+192] + movdqa xmm1, OWORD PTR [rcx+192] jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last - aesenc xmm8, xmm9 - aesenc xmm8, [rcx+208] - movdqa xmm9, OWORD PTR [rcx+224] + aesenc xmm0, xmm1 + aesenc xmm0, [rcx+208] + movdqa xmm1, OWORD PTR [rcx+224] L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: - aesenclast xmm8, xmm9 - movdqu xmm9, [r11] - pxor xmm8, xmm9 - movdqu [r10], xmm8 - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + aesenclast xmm0, xmm1 + movdqu xmm1, [r11] + pxor xmm0, xmm1 + movdqu [r10], xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask ret AES_GCM_encrypt_block_aesni ENDP _text ENDS _text SEGMENT READONLY PARA AES_GCM_ghash_block_aesni PROC + sub rsp, 32 + movdqu [rsp], xmm6 + movdqu [rsp+16], xmm7 movdqa xmm4, OWORD PTR [rdx] movdqa xmm5, OWORD PTR [r8] - movdqu xmm8, [rcx] - pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask - pxor xmm4, xmm8 + movdqu xmm7, [rcx] + pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 pshufd xmm1, xmm4, 78 pshufd xmm2, xmm5, 78 movdqa xmm3, xmm5 @@ -3872,6 +3929,9 @@ AES_GCM_ghash_block_aesni PROC pxor xmm2, xmm6 pxor xmm4, xmm2 movdqa OWORD PTR [rdx], xmm4 + movdqu xmm6, [rsp] + movdqu xmm7, [rsp+16] + add rsp, 32 ret AES_GCM_ghash_block_aesni ENDP _text ENDS @@ -3890,7 +3950,17 @@ AES_GCM_encrypt_update_aesni PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov r15, QWORD PTR [rsp+104] - sub rsp, 160 + sub rsp, 320 + movdqu [rsp+160], xmm6 + movdqu [rsp+176], xmm7 + movdqu [rsp+192], xmm8 + movdqu [rsp+208], xmm9 + movdqu [rsp+224], xmm10 + movdqu [rsp+240], xmm11 + movdqu [rsp+256], xmm12 + movdqu [rsp+272], xmm13 + movdqu [rsp+288], xmm14 + movdqu [rsp+304], xmm15 movdqa xmm6, OWORD PTR [r12] movdqa xmm5, OWORD PTR [r14] movdqa xmm9, xmm5 @@ -5085,7 +5155,17 @@ L_AES_GCM_encrypt_update_aesni_last_block_ghash: L_AES_GCM_encrypt_update_aesni_last_block_done: L_AES_GCM_encrypt_update_aesni_done_enc: movdqa OWORD PTR [r12], xmm6 - add rsp, 160 + movdqu xmm6, [rsp+160] + movdqu xmm7, [rsp+176] + movdqu xmm8, [rsp+192] + movdqu xmm9, [rsp+208] + movdqu xmm10, [rsp+224] + movdqu xmm11, [rsp+240] + movdqu xmm12, [rsp+256] + movdqu xmm13, [rsp+272] + movdqu xmm14, [rsp+288] + movdqu xmm15, [rsp+304] + add rsp, 320 pop rdi pop r15 pop r14 @@ -5105,20 +5185,28 @@ AES_GCM_encrypt_final_aesni PROC mov r11d, DWORD PTR [rsp+64] mov r12, QWORD PTR [rsp+72] mov r14, QWORD PTR [rsp+80] - sub rsp, 16 + sub rsp, 144 + movdqu [rsp+16], xmm6 + movdqu [rsp+32], xmm7 + movdqu [rsp+48], xmm8 + movdqu [rsp+64], xmm9 + movdqu [rsp+80], xmm10 + movdqu [rsp+96], xmm11 + movdqu [rsp+112], xmm12 + movdqu [rsp+128], xmm13 movdqa xmm4, OWORD PTR [rax] movdqa xmm5, OWORD PTR [r12] movdqa xmm6, OWORD PTR [r14] - movdqa xmm9, xmm5 movdqa xmm8, xmm5 - psrlq xmm9, 63 - psllq xmm8, 1 - pslldq xmm9, 8 - por xmm8, xmm9 + movdqa xmm7, xmm5 + psrlq xmm8, 63 + psllq xmm7, 1 + pslldq xmm8, 8 + por xmm7, xmm8 pshufd xmm5, xmm5, 255 psrad xmm5, 31 pand xmm5, OWORD PTR L_aes_gcm_mod2_128 - pxor xmm5, xmm8 + pxor xmm5, xmm7 mov edx, r10d mov ecx, r11d shl rdx, 3 @@ -5126,46 +5214,46 @@ AES_GCM_encrypt_final_aesni PROC pinsrq xmm0, rdx, 0 pinsrq xmm0, rcx, 1 pxor xmm4, xmm0 - pshufd xmm9, xmm5, 78 - pshufd xmm10, xmm4, 78 - movdqa xmm11, xmm4 - movdqa xmm8, xmm4 - pclmulqdq xmm11, xmm5, 17 - pclmulqdq xmm8, xmm5, 0 - pxor xmm9, xmm5 - pxor xmm10, xmm4 - pclmulqdq xmm9, xmm10, 0 - pxor xmm9, xmm8 - pxor xmm9, xmm11 - movdqa xmm10, xmm9 - movdqa xmm4, xmm11 - pslldq xmm10, 8 - psrldq xmm9, 8 + pshufd xmm8, xmm5, 78 + pshufd xmm9, xmm4, 78 + movdqa xmm10, xmm4 + movdqa xmm7, xmm4 + pclmulqdq xmm10, xmm5, 17 + pclmulqdq xmm7, xmm5, 0 + pxor xmm8, xmm5 + pxor xmm9, xmm4 + pclmulqdq xmm8, xmm9, 0 + pxor xmm8, xmm7 pxor xmm8, xmm10 - pxor xmm4, xmm9 - movdqa xmm12, xmm8 - movdqa xmm13, xmm8 - movdqa xmm14, xmm8 - pslld xmm12, 31 - pslld xmm13, 30 - pslld xmm14, 25 - pxor xmm12, xmm13 - pxor xmm12, xmm14 - movdqa xmm13, xmm12 - psrldq xmm13, 4 - pslldq xmm12, 12 - pxor xmm8, xmm12 - movdqa xmm14, xmm8 - movdqa xmm10, xmm8 movdqa xmm9, xmm8 - psrld xmm14, 1 - psrld xmm10, 2 - psrld xmm9, 7 - pxor xmm14, xmm10 - pxor xmm14, xmm9 - pxor xmm14, xmm13 - pxor xmm14, xmm8 - pxor xmm4, xmm14 + movdqa xmm4, xmm10 + pslldq xmm9, 8 + psrldq xmm8, 8 + pxor xmm7, xmm9 + pxor xmm4, xmm8 + movdqa xmm11, xmm7 + movdqa xmm12, xmm7 + movdqa xmm13, xmm7 + pslld xmm11, 31 + pslld xmm12, 30 + pslld xmm13, 25 + pxor xmm11, xmm12 + pxor xmm11, xmm13 + movdqa xmm12, xmm11 + psrldq xmm12, 4 + pslldq xmm11, 12 + pxor xmm7, xmm11 + movdqa xmm13, xmm7 + movdqa xmm9, xmm7 + movdqa xmm8, xmm7 + psrld xmm13, 1 + psrld xmm9, 2 + psrld xmm8, 7 + pxor xmm13, xmm9 + pxor xmm13, xmm8 + pxor xmm13, xmm12 + pxor xmm13, xmm7 + pxor xmm4, xmm13 pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask movdqu xmm0, xmm6 pxor xmm0, xmm4 @@ -5183,7 +5271,15 @@ L_AES_GCM_encrypt_final_aesni_store_tag_loop: L_AES_GCM_encrypt_final_aesni_store_tag_16: movdqu [r9], xmm0 L_AES_GCM_encrypt_final_aesni_store_tag_done: - add rsp, 16 + movdqu xmm6, [rsp+16] + movdqu xmm7, [rsp+32] + movdqu xmm8, [rsp+48] + movdqu xmm9, [rsp+64] + movdqu xmm10, [rsp+80] + movdqu xmm11, [rsp+96] + movdqu xmm12, [rsp+112] + movdqu xmm13, [rsp+128] + add rsp, 144 pop r14 pop r12 pop r13 @@ -5206,7 +5302,17 @@ AES_GCM_decrypt_update_aesni PROC mov r12, QWORD PTR [rsp+96] mov r14, QWORD PTR [rsp+104] mov r15, QWORD PTR [rsp+112] - sub rsp, 168 + sub rsp, 328 + movdqu [rsp+168], xmm6 + movdqu [rsp+184], xmm7 + movdqu [rsp+200], xmm8 + movdqu [rsp+216], xmm9 + movdqu [rsp+232], xmm10 + movdqu [rsp+248], xmm11 + movdqu [rsp+264], xmm12 + movdqu [rsp+280], xmm13 + movdqu [rsp+296], xmm14 + movdqu [rsp+312], xmm15 movdqa xmm6, OWORD PTR [r12] movdqa xmm5, OWORD PTR [r14] movdqa xmm9, xmm5 @@ -5943,7 +6049,17 @@ L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last: L_AES_GCM_decrypt_update_aesni_last_block_done: L_AES_GCM_decrypt_update_aesni_done_dec: movdqa OWORD PTR [r12], xmm6 - add rsp, 168 + movdqu xmm6, [rsp+168] + movdqu xmm7, [rsp+184] + movdqu xmm8, [rsp+200] + movdqu xmm9, [rsp+216] + movdqu xmm10, [rsp+232] + movdqu xmm11, [rsp+248] + movdqu xmm12, [rsp+264] + movdqu xmm13, [rsp+280] + movdqu xmm14, [rsp+296] + movdqu xmm15, [rsp+312] + add rsp, 328 pop rsi pop rdi pop r15 @@ -5967,20 +6083,29 @@ AES_GCM_decrypt_final_aesni PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov rbp, QWORD PTR [rsp+104] - sub rsp, 16 + sub rsp, 160 + movdqu [rsp+16], xmm6 + movdqu [rsp+32], xmm7 + movdqu [rsp+48], xmm8 + movdqu [rsp+64], xmm9 + movdqu [rsp+80], xmm10 + movdqu [rsp+96], xmm11 + movdqu [rsp+112], xmm12 + movdqu [rsp+128], xmm13 + movdqu [rsp+144], xmm15 movdqa xmm6, OWORD PTR [rax] movdqa xmm5, OWORD PTR [r12] movdqa xmm15, OWORD PTR [r14] - movdqa xmm9, xmm5 movdqa xmm8, xmm5 - psrlq xmm9, 63 - psllq xmm8, 1 - pslldq xmm9, 8 - por xmm8, xmm9 + movdqa xmm7, xmm5 + psrlq xmm8, 63 + psllq xmm7, 1 + pslldq xmm8, 8 + por xmm7, xmm8 pshufd xmm5, xmm5, 255 psrad xmm5, 31 pand xmm5, OWORD PTR L_aes_gcm_mod2_128 - pxor xmm5, xmm8 + pxor xmm5, xmm7 mov edx, r10d mov ecx, r11d shl rdx, 3 @@ -5988,46 +6113,46 @@ AES_GCM_decrypt_final_aesni PROC pinsrq xmm0, rdx, 0 pinsrq xmm0, rcx, 1 pxor xmm6, xmm0 - pshufd xmm9, xmm5, 78 - pshufd xmm10, xmm6, 78 - movdqa xmm11, xmm6 - movdqa xmm8, xmm6 - pclmulqdq xmm11, xmm5, 17 - pclmulqdq xmm8, xmm5, 0 - pxor xmm9, xmm5 - pxor xmm10, xmm6 - pclmulqdq xmm9, xmm10, 0 - pxor xmm9, xmm8 - pxor xmm9, xmm11 - movdqa xmm10, xmm9 - movdqa xmm6, xmm11 - pslldq xmm10, 8 - psrldq xmm9, 8 + pshufd xmm8, xmm5, 78 + pshufd xmm9, xmm6, 78 + movdqa xmm10, xmm6 + movdqa xmm7, xmm6 + pclmulqdq xmm10, xmm5, 17 + pclmulqdq xmm7, xmm5, 0 + pxor xmm8, xmm5 + pxor xmm9, xmm6 + pclmulqdq xmm8, xmm9, 0 + pxor xmm8, xmm7 pxor xmm8, xmm10 - pxor xmm6, xmm9 - movdqa xmm12, xmm8 - movdqa xmm13, xmm8 - movdqa xmm14, xmm8 - pslld xmm12, 31 - pslld xmm13, 30 - pslld xmm14, 25 - pxor xmm12, xmm13 - pxor xmm12, xmm14 - movdqa xmm13, xmm12 - psrldq xmm13, 4 - pslldq xmm12, 12 - pxor xmm8, xmm12 - movdqa xmm14, xmm8 - movdqa xmm10, xmm8 movdqa xmm9, xmm8 - psrld xmm14, 1 - psrld xmm10, 2 - psrld xmm9, 7 - pxor xmm14, xmm10 - pxor xmm14, xmm9 - pxor xmm14, xmm13 - pxor xmm14, xmm8 - pxor xmm6, xmm14 + movdqa xmm6, xmm10 + pslldq xmm9, 8 + psrldq xmm8, 8 + pxor xmm7, xmm9 + pxor xmm6, xmm8 + movdqa xmm11, xmm7 + movdqa xmm12, xmm7 + movdqa xmm13, xmm7 + pslld xmm11, 31 + pslld xmm12, 30 + pslld xmm13, 25 + pxor xmm11, xmm12 + pxor xmm11, xmm13 + movdqa xmm12, xmm11 + psrldq xmm12, 4 + pslldq xmm11, 12 + pxor xmm7, xmm11 + movdqa xmm13, xmm7 + movdqa xmm9, xmm7 + movdqa xmm8, xmm7 + psrld xmm13, 1 + psrld xmm9, 2 + psrld xmm8, 7 + pxor xmm13, xmm9 + pxor xmm13, xmm8 + pxor xmm13, xmm12 + pxor xmm13, xmm7 + pxor xmm6, xmm13 pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask movdqu xmm0, xmm15 pxor xmm0, xmm6 @@ -6059,7 +6184,16 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_16: sete r15b L_AES_GCM_decrypt_final_aesni_cmp_tag_done: mov DWORD PTR [rbp], r15d - add rsp, 16 + movdqu xmm6, [rsp+16] + movdqu xmm7, [rsp+32] + movdqu xmm8, [rsp+48] + movdqu xmm9, [rsp+64] + movdqu xmm10, [rsp+80] + movdqu xmm11, [rsp+96] + movdqu xmm12, [rsp+112] + movdqu xmm13, [rsp+128] + movdqu xmm15, [rsp+144] + add rsp, 160 pop r15 pop rbp pop r14 @@ -6144,7 +6278,17 @@ AES_GCM_encrypt_avx1 PROC mov r14d, DWORD PTR [rsp+128] mov r15, QWORD PTR [rsp+136] mov r10d, DWORD PTR [rsp+144] - sub rsp, 160 + sub rsp, 320 + vmovdqu OWORD PTR [rsp+160], xmm6 + vmovdqu OWORD PTR [rsp+176], xmm7 + vmovdqu OWORD PTR [rsp+192], xmm8 + vmovdqu OWORD PTR [rsp+208], xmm9 + vmovdqu OWORD PTR [rsp+224], xmm10 + vmovdqu OWORD PTR [rsp+240], xmm11 + vmovdqu OWORD PTR [rsp+256], xmm12 + vmovdqu OWORD PTR [rsp+272], xmm13 + vmovdqu OWORD PTR [rsp+288], xmm14 + vmovdqu OWORD PTR [rsp+304], xmm15 vpxor xmm4, xmm4, xmm4 vpxor xmm6, xmm6, xmm6 mov edx, ebx @@ -7692,7 +7836,17 @@ L_AES_GCM_encrypt_avx1_store_tag_16: vmovdqu OWORD PTR [r8], xmm0 L_AES_GCM_encrypt_avx1_store_tag_done: vzeroupper - add rsp, 160 + vmovdqu xmm6, OWORD PTR [rsp+160] + vmovdqu xmm7, OWORD PTR [rsp+176] + vmovdqu xmm8, OWORD PTR [rsp+192] + vmovdqu xmm9, OWORD PTR [rsp+208] + vmovdqu xmm10, OWORD PTR [rsp+224] + vmovdqu xmm11, OWORD PTR [rsp+240] + vmovdqu xmm12, OWORD PTR [rsp+256] + vmovdqu xmm13, OWORD PTR [rsp+272] + vmovdqu xmm14, OWORD PTR [rsp+288] + vmovdqu xmm15, OWORD PTR [rsp+304] + add rsp, 320 pop r15 pop r14 pop rbx @@ -7725,7 +7879,17 @@ AES_GCM_decrypt_avx1 PROC mov r15, QWORD PTR [rsp+144] mov r10d, DWORD PTR [rsp+152] mov rbp, QWORD PTR [rsp+160] - sub rsp, 168 + sub rsp, 328 + vmovdqu OWORD PTR [rsp+168], xmm6 + vmovdqu OWORD PTR [rsp+184], xmm7 + vmovdqu OWORD PTR [rsp+200], xmm8 + vmovdqu OWORD PTR [rsp+216], xmm9 + vmovdqu OWORD PTR [rsp+232], xmm10 + vmovdqu OWORD PTR [rsp+248], xmm11 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm13 + vmovdqu OWORD PTR [rsp+296], xmm14 + vmovdqu OWORD PTR [rsp+312], xmm15 vpxor xmm4, xmm4, xmm4 vpxor xmm6, xmm6, xmm6 cmp ebx, 12 @@ -8864,7 +9028,17 @@ L_AES_GCM_decrypt_avx1_cmp_tag_16: L_AES_GCM_decrypt_avx1_cmp_tag_done: mov DWORD PTR [rbp], ebx vzeroupper - add rsp, 168 + vmovdqu xmm6, OWORD PTR [rsp+168] + vmovdqu xmm7, OWORD PTR [rsp+184] + vmovdqu xmm8, OWORD PTR [rsp+200] + vmovdqu xmm9, OWORD PTR [rsp+216] + vmovdqu xmm10, OWORD PTR [rsp+232] + vmovdqu xmm11, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm13, OWORD PTR [rsp+280] + vmovdqu xmm14, OWORD PTR [rsp+296] + vmovdqu xmm15, OWORD PTR [rsp+312] + add rsp, 328 pop rbp pop r15 pop r14 @@ -8889,7 +9063,11 @@ AES_GCM_init_avx1 PROC mov rax, QWORD PTR [rsp+72] mov r8, QWORD PTR [rsp+80] mov r9, QWORD PTR [rsp+88] - sub rsp, 16 + sub rsp, 80 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm15 vpxor xmm4, xmm4, xmm4 mov edx, r11d cmp edx, 12 @@ -8903,53 +9081,53 @@ AES_GCM_init_avx1 PROC ; H = Encrypt X(=0) and T = Encrypt counter vmovdqa xmm5, OWORD PTR [rdi] vpxor xmm1, xmm4, xmm5 - vmovdqa xmm7, OWORD PTR [rdi+16] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+32] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+48] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+64] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+80] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+96] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+112] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+128] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+144] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm6, OWORD PTR [rdi+16] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+32] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+48] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+64] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+80] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+96] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+112] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+128] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+144] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 cmp esi, 11 - vmovdqa xmm7, OWORD PTR [rdi+160] + vmovdqa xmm6, OWORD PTR [rdi+160] jl L_AES_GCM_init_avx1_calc_iv_12_last - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+176] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+176] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 cmp esi, 13 - vmovdqa xmm7, OWORD PTR [rdi+192] + vmovdqa xmm6, OWORD PTR [rdi+192] jl L_AES_GCM_init_avx1_calc_iv_12_last - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+208] - vaesenc xmm5, xmm5, xmm7 - vaesenc xmm1, xmm1, xmm7 - vmovdqa xmm7, OWORD PTR [rdi+224] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+208] + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm1, xmm1, xmm6 + vmovdqa xmm6, OWORD PTR [rdi+224] L_AES_GCM_init_avx1_calc_iv_12_last: - vaesenclast xmm5, xmm5, xmm7 - vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm5, xmm5, xmm6 + vaesenclast xmm1, xmm1, xmm6 vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask vmovdqu xmm15, xmm1 jmp L_AES_GCM_init_avx1_iv_done @@ -8967,18 +9145,18 @@ L_AES_GCM_init_avx1_iv_not_12: vaesenc xmm5, xmm5, [rdi+128] vaesenc xmm5, xmm5, [rdi+144] cmp esi, 11 - vmovdqa xmm9, OWORD PTR [rdi+160] + vmovdqa xmm8, OWORD PTR [rdi+160] jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last - vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, xmm8 vaesenc xmm5, xmm5, [rdi+176] cmp esi, 13 - vmovdqa xmm9, OWORD PTR [rdi+192] + vmovdqa xmm8, OWORD PTR [rdi+192] jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last - vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, xmm8 vaesenc xmm5, xmm5, [rdi+208] - vmovdqa xmm9, OWORD PTR [rdi+224] + vmovdqa xmm8, OWORD PTR [rdi+224] L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: - vaesenclast xmm5, xmm5, xmm9 + vaesenclast xmm5, xmm5, xmm8 vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask ; Calc counter ; Initialization vector @@ -8989,9 +9167,9 @@ L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_init_avx1_calc_iv_lt16 and edx, 4294967280 L_AES_GCM_init_avx1_calc_iv_16_loop: - vmovdqu xmm8, OWORD PTR [r10+rcx] - vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask - vpxor xmm4, xmm4, xmm8 + vmovdqu xmm7, OWORD PTR [r10+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 ; ghash_gfmul_avx vpshufd xmm1, xmm4, 78 vpshufd xmm2, xmm5, 78 @@ -9002,38 +9180,38 @@ L_AES_GCM_init_avx1_calc_iv_16_loop: vpclmulqdq xmm1, xmm1, xmm2, 0 vpxor xmm1, xmm1, xmm0 vpxor xmm1, xmm1, xmm3 - vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm0 vmovdqa xmm4, xmm3 vpslldq xmm2, xmm1, 8 vpsrldq xmm1, xmm1, 8 - vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm2 vpxor xmm4, xmm4, xmm1 - vpsrld xmm0, xmm7, 31 + vpsrld xmm0, xmm6, 31 vpsrld xmm1, xmm4, 31 - vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 vpslld xmm4, xmm4, 1 vpsrldq xmm2, xmm0, 12 vpslldq xmm0, xmm0, 4 vpslldq xmm1, xmm1, 4 vpor xmm4, xmm4, xmm2 - vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm0 vpor xmm4, xmm4, xmm1 - vpslld xmm0, xmm7, 31 - vpslld xmm1, xmm7, 30 - vpslld xmm2, xmm7, 25 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 vpxor xmm0, xmm0, xmm1 vpxor xmm0, xmm0, xmm2 vmovdqa xmm1, xmm0 vpsrldq xmm1, xmm1, 4 vpslldq xmm0, xmm0, 12 - vpxor xmm7, xmm7, xmm0 - vpsrld xmm2, xmm7, 1 - vpsrld xmm3, xmm7, 2 - vpsrld xmm0, xmm7, 7 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 vpxor xmm2, xmm2, xmm3 vpxor xmm2, xmm2, xmm0 vpxor xmm2, xmm2, xmm1 - vpxor xmm2, xmm2, xmm7 + vpxor xmm2, xmm2, xmm6 vpxor xmm4, xmm4, xmm2 add ecx, 16 cmp ecx, edx @@ -9043,9 +9221,9 @@ L_AES_GCM_init_avx1_calc_iv_16_loop: je L_AES_GCM_init_avx1_calc_iv_done L_AES_GCM_init_avx1_calc_iv_lt16: sub rsp, 16 - vpxor xmm8, xmm8, xmm8 + vpxor xmm7, xmm7, xmm7 xor r13d, r13d - vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp], xmm7 L_AES_GCM_init_avx1_calc_iv_loop: movzx r12d, BYTE PTR [r10+rcx] mov BYTE PTR [rsp+r13], r12b @@ -9053,10 +9231,10 @@ L_AES_GCM_init_avx1_calc_iv_loop: inc r13d cmp ecx, edx jl L_AES_GCM_init_avx1_calc_iv_loop - vmovdqu xmm8, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp] add rsp, 16 - vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask - vpxor xmm4, xmm4, xmm8 + vpshufb xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 ; ghash_gfmul_avx vpshufd xmm1, xmm4, 78 vpshufd xmm2, xmm5, 78 @@ -9067,38 +9245,38 @@ L_AES_GCM_init_avx1_calc_iv_loop: vpclmulqdq xmm1, xmm1, xmm2, 0 vpxor xmm1, xmm1, xmm0 vpxor xmm1, xmm1, xmm3 - vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm0 vmovdqa xmm4, xmm3 vpslldq xmm2, xmm1, 8 vpsrldq xmm1, xmm1, 8 - vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm2 vpxor xmm4, xmm4, xmm1 - vpsrld xmm0, xmm7, 31 + vpsrld xmm0, xmm6, 31 vpsrld xmm1, xmm4, 31 - vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 vpslld xmm4, xmm4, 1 vpsrldq xmm2, xmm0, 12 vpslldq xmm0, xmm0, 4 vpslldq xmm1, xmm1, 4 vpor xmm4, xmm4, xmm2 - vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm0 vpor xmm4, xmm4, xmm1 - vpslld xmm0, xmm7, 31 - vpslld xmm1, xmm7, 30 - vpslld xmm2, xmm7, 25 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 vpxor xmm0, xmm0, xmm1 vpxor xmm0, xmm0, xmm2 vmovdqa xmm1, xmm0 vpsrldq xmm1, xmm1, 4 vpslldq xmm0, xmm0, 12 - vpxor xmm7, xmm7, xmm0 - vpsrld xmm2, xmm7, 1 - vpsrld xmm3, xmm7, 2 - vpsrld xmm0, xmm7, 7 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 vpxor xmm2, xmm2, xmm3 vpxor xmm2, xmm2, xmm0 vpxor xmm2, xmm2, xmm1 - vpxor xmm2, xmm2, xmm7 + vpxor xmm2, xmm2, xmm6 vpxor xmm4, xmm4, xmm2 L_AES_GCM_init_avx1_calc_iv_done: ; T = Encrypt counter @@ -9116,66 +9294,66 @@ L_AES_GCM_init_avx1_calc_iv_done: vpclmulqdq xmm1, xmm1, xmm2, 0 vpxor xmm1, xmm1, xmm0 vpxor xmm1, xmm1, xmm3 - vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm0 vmovdqa xmm4, xmm3 vpslldq xmm2, xmm1, 8 vpsrldq xmm1, xmm1, 8 - vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm2 vpxor xmm4, xmm4, xmm1 - vpsrld xmm0, xmm7, 31 + vpsrld xmm0, xmm6, 31 vpsrld xmm1, xmm4, 31 - vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 vpslld xmm4, xmm4, 1 vpsrldq xmm2, xmm0, 12 vpslldq xmm0, xmm0, 4 vpslldq xmm1, xmm1, 4 vpor xmm4, xmm4, xmm2 - vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm0 vpor xmm4, xmm4, xmm1 - vpslld xmm0, xmm7, 31 - vpslld xmm1, xmm7, 30 - vpslld xmm2, xmm7, 25 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 vpxor xmm0, xmm0, xmm1 vpxor xmm0, xmm0, xmm2 vmovdqa xmm1, xmm0 vpsrldq xmm1, xmm1, 4 vpslldq xmm0, xmm0, 12 - vpxor xmm7, xmm7, xmm0 - vpsrld xmm2, xmm7, 1 - vpsrld xmm3, xmm7, 2 - vpsrld xmm0, xmm7, 7 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 vpxor xmm2, xmm2, xmm3 vpxor xmm2, xmm2, xmm0 vpxor xmm2, xmm2, xmm1 - vpxor xmm2, xmm2, xmm7 + vpxor xmm2, xmm2, xmm6 vpxor xmm4, xmm4, xmm2 vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask ; Encrypt counter - vmovdqa xmm8, OWORD PTR [rdi] - vpxor xmm8, xmm8, xmm4 - vaesenc xmm8, xmm8, [rdi+16] - vaesenc xmm8, xmm8, [rdi+32] - vaesenc xmm8, xmm8, [rdi+48] - vaesenc xmm8, xmm8, [rdi+64] - vaesenc xmm8, xmm8, [rdi+80] - vaesenc xmm8, xmm8, [rdi+96] - vaesenc xmm8, xmm8, [rdi+112] - vaesenc xmm8, xmm8, [rdi+128] - vaesenc xmm8, xmm8, [rdi+144] + vmovdqa xmm7, OWORD PTR [rdi] + vpxor xmm7, xmm7, xmm4 + vaesenc xmm7, xmm7, [rdi+16] + vaesenc xmm7, xmm7, [rdi+32] + vaesenc xmm7, xmm7, [rdi+48] + vaesenc xmm7, xmm7, [rdi+64] + vaesenc xmm7, xmm7, [rdi+80] + vaesenc xmm7, xmm7, [rdi+96] + vaesenc xmm7, xmm7, [rdi+112] + vaesenc xmm7, xmm7, [rdi+128] + vaesenc xmm7, xmm7, [rdi+144] cmp esi, 11 - vmovdqa xmm9, OWORD PTR [rdi+160] + vmovdqa xmm8, OWORD PTR [rdi+160] jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last - vaesenc xmm8, xmm8, xmm9 - vaesenc xmm8, xmm8, [rdi+176] + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rdi+176] cmp esi, 13 - vmovdqa xmm9, OWORD PTR [rdi+192] + vmovdqa xmm8, OWORD PTR [rdi+192] jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last - vaesenc xmm8, xmm8, xmm9 - vaesenc xmm8, xmm8, [rdi+208] - vmovdqa xmm9, OWORD PTR [rdi+224] + vaesenc xmm7, xmm7, xmm8 + vaesenc xmm7, xmm7, [rdi+208] + vmovdqa xmm8, OWORD PTR [rdi+224] L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last: - vaesenclast xmm8, xmm8, xmm9 - vmovdqu xmm15, xmm8 + vaesenclast xmm7, xmm7, xmm8 + vmovdqu xmm15, xmm7 L_AES_GCM_init_avx1_iv_done: vmovdqa OWORD PTR [r9], xmm15 vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 @@ -9183,7 +9361,11 @@ L_AES_GCM_init_avx1_iv_done: vmovdqa OWORD PTR [rax], xmm5 vmovdqa OWORD PTR [r8], xmm4 vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm15, OWORD PTR [rsp+64] + add rsp, 80 pop r13 pop r12 pop rsi @@ -9194,13 +9376,16 @@ _text ENDS _text SEGMENT READONLY PARA AES_GCM_aad_update_avx1 PROC mov rax, rcx + sub rsp, 32 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 vmovdqa xmm5, OWORD PTR [r8] vmovdqa xmm6, OWORD PTR [r9] xor ecx, ecx L_AES_GCM_aad_update_avx1_16_loop: - vmovdqu xmm8, OWORD PTR [rax+rcx] - vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask - vpxor xmm5, xmm5, xmm8 + vmovdqu xmm7, OWORD PTR [rax+rcx] + vpshufb xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm7 ; ghash_gfmul_avx vpshufd xmm1, xmm5, 78 vpshufd xmm2, xmm6, 78 @@ -9249,6 +9434,9 @@ L_AES_GCM_aad_update_avx1_16_loop: jl L_AES_GCM_aad_update_avx1_16_loop vmovdqa OWORD PTR [r8], xmm5 vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + add rsp, 32 ret AES_GCM_aad_update_avx1 ENDP _text ENDS @@ -9257,48 +9445,51 @@ AES_GCM_encrypt_block_avx1 PROC mov r10, r8 mov r11, r9 mov rax, QWORD PTR [rsp+40] - vmovdqu xmm9, OWORD PTR [rax] - vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 - vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one - vmovdqu OWORD PTR [rax], xmm9 - vpxor xmm8, xmm8, [rcx] - vaesenc xmm8, xmm8, [rcx+16] - vaesenc xmm8, xmm8, [rcx+32] - vaesenc xmm8, xmm8, [rcx+48] - vaesenc xmm8, xmm8, [rcx+64] - vaesenc xmm8, xmm8, [rcx+80] - vaesenc xmm8, xmm8, [rcx+96] - vaesenc xmm8, xmm8, [rcx+112] - vaesenc xmm8, xmm8, [rcx+128] - vaesenc xmm8, xmm8, [rcx+144] + vmovdqu xmm1, OWORD PTR [rax] + vpshufb xmm0, xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [rax], xmm1 + vpxor xmm0, xmm0, [rcx] + vaesenc xmm0, xmm0, [rcx+16] + vaesenc xmm0, xmm0, [rcx+32] + vaesenc xmm0, xmm0, [rcx+48] + vaesenc xmm0, xmm0, [rcx+64] + vaesenc xmm0, xmm0, [rcx+80] + vaesenc xmm0, xmm0, [rcx+96] + vaesenc xmm0, xmm0, [rcx+112] + vaesenc xmm0, xmm0, [rcx+128] + vaesenc xmm0, xmm0, [rcx+144] cmp edx, 11 - vmovdqa xmm9, OWORD PTR [rcx+160] + vmovdqa xmm1, OWORD PTR [rcx+160] jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last - vaesenc xmm8, xmm8, xmm9 - vaesenc xmm8, xmm8, [rcx+176] + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [rcx+176] cmp edx, 13 - vmovdqa xmm9, OWORD PTR [rcx+192] + vmovdqa xmm1, OWORD PTR [rcx+192] jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last - vaesenc xmm8, xmm8, xmm9 - vaesenc xmm8, xmm8, [rcx+208] - vmovdqa xmm9, OWORD PTR [rcx+224] + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [rcx+208] + vmovdqa xmm1, OWORD PTR [rcx+224] L_AES_GCM_encrypt_block_avx1_aesenc_block_last: - vaesenclast xmm8, xmm8, xmm9 - vmovdqu xmm9, OWORD PTR [r11] - vpxor xmm8, xmm8, xmm9 - vmovdqu OWORD PTR [r10], xmm8 - vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenclast xmm0, xmm0, xmm1 + vmovdqu xmm1, OWORD PTR [r11] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [r10], xmm0 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask vzeroupper ret AES_GCM_encrypt_block_avx1 ENDP _text ENDS _text SEGMENT READONLY PARA AES_GCM_ghash_block_avx1 PROC + sub rsp, 32 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 vmovdqa xmm4, OWORD PTR [rdx] vmovdqa xmm5, OWORD PTR [r8] - vmovdqu xmm8, OWORD PTR [rcx] - vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask - vpxor xmm4, xmm4, xmm8 + vmovdqu xmm7, OWORD PTR [rcx] + vpshufb xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm7 ; ghash_gfmul_avx vpshufd xmm1, xmm4, 78 vpshufd xmm2, xmm5, 78 @@ -9344,6 +9535,9 @@ AES_GCM_ghash_block_avx1 PROC vpxor xmm4, xmm4, xmm2 vmovdqa OWORD PTR [rdx], xmm4 vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + add rsp, 32 ret AES_GCM_ghash_block_avx1 ENDP _text ENDS @@ -9362,7 +9556,17 @@ AES_GCM_encrypt_update_avx1 PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov r15, QWORD PTR [rsp+104] - sub rsp, 160 + sub rsp, 320 + vmovdqu OWORD PTR [rsp+160], xmm6 + vmovdqu OWORD PTR [rsp+176], xmm7 + vmovdqu OWORD PTR [rsp+192], xmm8 + vmovdqu OWORD PTR [rsp+208], xmm9 + vmovdqu OWORD PTR [rsp+224], xmm10 + vmovdqu OWORD PTR [rsp+240], xmm11 + vmovdqu OWORD PTR [rsp+256], xmm12 + vmovdqu OWORD PTR [rsp+272], xmm13 + vmovdqu OWORD PTR [rsp+288], xmm14 + vmovdqu OWORD PTR [rsp+304], xmm15 vmovdqa xmm6, OWORD PTR [r12] vmovdqa xmm5, OWORD PTR [r14] vpsrlq xmm9, xmm5, 63 @@ -10360,7 +10564,17 @@ L_AES_GCM_encrypt_update_avx1_last_block_done: L_AES_GCM_encrypt_update_avx1_done_enc: vmovdqa OWORD PTR [r12], xmm6 vzeroupper - add rsp, 160 + vmovdqu xmm6, OWORD PTR [rsp+160] + vmovdqu xmm7, OWORD PTR [rsp+176] + vmovdqu xmm8, OWORD PTR [rsp+192] + vmovdqu xmm9, OWORD PTR [rsp+208] + vmovdqu xmm10, OWORD PTR [rsp+224] + vmovdqu xmm11, OWORD PTR [rsp+240] + vmovdqu xmm12, OWORD PTR [rsp+256] + vmovdqu xmm13, OWORD PTR [rsp+272] + vmovdqu xmm14, OWORD PTR [rsp+288] + vmovdqu xmm15, OWORD PTR [rsp+304] + add rsp, 320 pop rdi pop r15 pop r14 @@ -10380,18 +10594,26 @@ AES_GCM_encrypt_final_avx1 PROC mov r11d, DWORD PTR [rsp+64] mov r12, QWORD PTR [rsp+72] mov r14, QWORD PTR [rsp+80] - sub rsp, 16 + sub rsp, 144 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm9 + vmovdqu OWORD PTR [rsp+80], xmm10 + vmovdqu OWORD PTR [rsp+96], xmm11 + vmovdqu OWORD PTR [rsp+112], xmm12 + vmovdqu OWORD PTR [rsp+128], xmm13 vmovdqa xmm4, OWORD PTR [rax] vmovdqa xmm5, OWORD PTR [r12] vmovdqa xmm6, OWORD PTR [r14] - vpsrlq xmm9, xmm5, 63 - vpsllq xmm8, xmm5, 1 - vpslldq xmm9, xmm9, 8 - vpor xmm8, xmm8, xmm9 + vpsrlq xmm8, xmm5, 63 + vpsllq xmm7, xmm5, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 vpshufd xmm5, xmm5, 255 vpsrad xmm5, xmm5, 31 vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 - vpxor xmm5, xmm5, xmm8 + vpxor xmm5, xmm5, xmm7 mov edx, r10d mov ecx, r11d shl rdx, 3 @@ -10401,35 +10623,35 @@ AES_GCM_encrypt_final_avx1 PROC vpunpcklqdq xmm0, xmm0, xmm1 vpxor xmm4, xmm4, xmm0 ; ghash_gfmul_red_avx - vpshufd xmm9, xmm5, 78 - vpshufd xmm10, xmm4, 78 - vpclmulqdq xmm11, xmm4, xmm5, 17 - vpclmulqdq xmm8, xmm4, xmm5, 0 - vpxor xmm9, xmm9, xmm5 - vpxor xmm10, xmm10, xmm4 - vpclmulqdq xmm9, xmm9, xmm10, 0 - vpxor xmm9, xmm9, xmm8 - vpxor xmm9, xmm9, xmm11 - vpslldq xmm10, xmm9, 8 - vpsrldq xmm9, xmm9, 8 + vpshufd xmm8, xmm5, 78 + vpshufd xmm9, xmm4, 78 + vpclmulqdq xmm10, xmm4, xmm5, 17 + vpclmulqdq xmm7, xmm4, xmm5, 0 + vpxor xmm8, xmm8, xmm5 + vpxor xmm9, xmm9, xmm4 + vpclmulqdq xmm8, xmm8, xmm9, 0 + vpxor xmm8, xmm8, xmm7 vpxor xmm8, xmm8, xmm10 - vpxor xmm4, xmm11, xmm9 - vpslld xmm12, xmm8, 31 - vpslld xmm13, xmm8, 30 - vpslld xmm14, xmm8, 25 - vpxor xmm12, xmm12, xmm13 - vpxor xmm12, xmm12, xmm14 - vpsrldq xmm13, xmm12, 4 - vpslldq xmm12, xmm12, 12 - vpxor xmm8, xmm8, xmm12 - vpsrld xmm14, xmm8, 1 - vpsrld xmm10, xmm8, 2 - vpsrld xmm9, xmm8, 7 - vpxor xmm14, xmm14, xmm10 - vpxor xmm14, xmm14, xmm9 - vpxor xmm14, xmm14, xmm13 - vpxor xmm14, xmm14, xmm8 - vpxor xmm4, xmm4, xmm14 + vpslldq xmm9, xmm8, 8 + vpsrldq xmm8, xmm8, 8 + vpxor xmm7, xmm7, xmm9 + vpxor xmm4, xmm10, xmm8 + vpslld xmm11, xmm7, 31 + vpslld xmm12, xmm7, 30 + vpslld xmm13, xmm7, 25 + vpxor xmm11, xmm11, xmm12 + vpxor xmm11, xmm11, xmm13 + vpsrldq xmm12, xmm11, 4 + vpslldq xmm11, xmm11, 12 + vpxor xmm7, xmm7, xmm11 + vpsrld xmm13, xmm7, 1 + vpsrld xmm9, xmm7, 2 + vpsrld xmm8, xmm7, 7 + vpxor xmm13, xmm13, xmm9 + vpxor xmm13, xmm13, xmm8 + vpxor xmm13, xmm13, xmm12 + vpxor xmm13, xmm13, xmm7 + vpxor xmm4, xmm4, xmm13 vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask vpxor xmm0, xmm4, xmm6 cmp r8d, 16 @@ -10447,7 +10669,15 @@ L_AES_GCM_encrypt_final_avx1_store_tag_16: vmovdqu OWORD PTR [r9], xmm0 L_AES_GCM_encrypt_final_avx1_store_tag_done: vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm9, OWORD PTR [rsp+64] + vmovdqu xmm10, OWORD PTR [rsp+80] + vmovdqu xmm11, OWORD PTR [rsp+96] + vmovdqu xmm12, OWORD PTR [rsp+112] + vmovdqu xmm13, OWORD PTR [rsp+128] + add rsp, 144 pop r14 pop r12 pop r13 @@ -10469,7 +10699,17 @@ AES_GCM_decrypt_update_avx1 PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov r15, QWORD PTR [rsp+104] - sub rsp, 168 + sub rsp, 328 + vmovdqu OWORD PTR [rsp+168], xmm6 + vmovdqu OWORD PTR [rsp+184], xmm7 + vmovdqu OWORD PTR [rsp+200], xmm8 + vmovdqu OWORD PTR [rsp+216], xmm9 + vmovdqu OWORD PTR [rsp+232], xmm10 + vmovdqu OWORD PTR [rsp+248], xmm11 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm13 + vmovdqu OWORD PTR [rsp+296], xmm14 + vmovdqu OWORD PTR [rsp+312], xmm15 vmovdqa xmm6, OWORD PTR [r12] vmovdqa xmm5, OWORD PTR [r14] vpsrlq xmm9, xmm5, 63 @@ -11051,7 +11291,17 @@ L_AES_GCM_decrypt_update_avx1_last_block_done: L_AES_GCM_decrypt_update_avx1_done_dec: vmovdqa OWORD PTR [r12], xmm6 vzeroupper - add rsp, 168 + vmovdqu xmm6, OWORD PTR [rsp+168] + vmovdqu xmm7, OWORD PTR [rsp+184] + vmovdqu xmm8, OWORD PTR [rsp+200] + vmovdqu xmm9, OWORD PTR [rsp+216] + vmovdqu xmm10, OWORD PTR [rsp+232] + vmovdqu xmm11, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm13, OWORD PTR [rsp+280] + vmovdqu xmm14, OWORD PTR [rsp+296] + vmovdqu xmm15, OWORD PTR [rsp+312] + add rsp, 328 pop rdi pop r15 pop r14 @@ -11074,18 +11324,27 @@ AES_GCM_decrypt_final_avx1 PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov rbp, QWORD PTR [rsp+104] - sub rsp, 16 + sub rsp, 160 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 + vmovdqu OWORD PTR [rsp+48], xmm8 + vmovdqu OWORD PTR [rsp+64], xmm9 + vmovdqu OWORD PTR [rsp+80], xmm10 + vmovdqu OWORD PTR [rsp+96], xmm11 + vmovdqu OWORD PTR [rsp+112], xmm12 + vmovdqu OWORD PTR [rsp+128], xmm13 + vmovdqu OWORD PTR [rsp+144], xmm15 vmovdqa xmm6, OWORD PTR [rax] vmovdqa xmm5, OWORD PTR [r12] vmovdqa xmm15, OWORD PTR [r14] - vpsrlq xmm9, xmm5, 63 - vpsllq xmm8, xmm5, 1 - vpslldq xmm9, xmm9, 8 - vpor xmm8, xmm8, xmm9 + vpsrlq xmm8, xmm5, 63 + vpsllq xmm7, xmm5, 1 + vpslldq xmm8, xmm8, 8 + vpor xmm7, xmm7, xmm8 vpshufd xmm5, xmm5, 255 vpsrad xmm5, xmm5, 31 vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 - vpxor xmm5, xmm5, xmm8 + vpxor xmm5, xmm5, xmm7 mov edx, r10d mov ecx, r11d shl rdx, 3 @@ -11095,35 +11354,35 @@ AES_GCM_decrypt_final_avx1 PROC vpunpcklqdq xmm0, xmm0, xmm1 vpxor xmm6, xmm6, xmm0 ; ghash_gfmul_red_avx - vpshufd xmm9, xmm5, 78 - vpshufd xmm10, xmm6, 78 - vpclmulqdq xmm11, xmm6, xmm5, 17 - vpclmulqdq xmm8, xmm6, xmm5, 0 - vpxor xmm9, xmm9, xmm5 - vpxor xmm10, xmm10, xmm6 - vpclmulqdq xmm9, xmm9, xmm10, 0 - vpxor xmm9, xmm9, xmm8 - vpxor xmm9, xmm9, xmm11 - vpslldq xmm10, xmm9, 8 - vpsrldq xmm9, xmm9, 8 + vpshufd xmm8, xmm5, 78 + vpshufd xmm9, xmm6, 78 + vpclmulqdq xmm10, xmm6, xmm5, 17 + vpclmulqdq xmm7, xmm6, xmm5, 0 + vpxor xmm8, xmm8, xmm5 + vpxor xmm9, xmm9, xmm6 + vpclmulqdq xmm8, xmm8, xmm9, 0 + vpxor xmm8, xmm8, xmm7 vpxor xmm8, xmm8, xmm10 - vpxor xmm6, xmm11, xmm9 - vpslld xmm12, xmm8, 31 - vpslld xmm13, xmm8, 30 - vpslld xmm14, xmm8, 25 - vpxor xmm12, xmm12, xmm13 - vpxor xmm12, xmm12, xmm14 - vpsrldq xmm13, xmm12, 4 - vpslldq xmm12, xmm12, 12 - vpxor xmm8, xmm8, xmm12 - vpsrld xmm14, xmm8, 1 - vpsrld xmm10, xmm8, 2 - vpsrld xmm9, xmm8, 7 - vpxor xmm14, xmm14, xmm10 - vpxor xmm14, xmm14, xmm9 - vpxor xmm14, xmm14, xmm13 - vpxor xmm14, xmm14, xmm8 - vpxor xmm6, xmm6, xmm14 + vpslldq xmm9, xmm8, 8 + vpsrldq xmm8, xmm8, 8 + vpxor xmm7, xmm7, xmm9 + vpxor xmm6, xmm10, xmm8 + vpslld xmm11, xmm7, 31 + vpslld xmm12, xmm7, 30 + vpslld xmm13, xmm7, 25 + vpxor xmm11, xmm11, xmm12 + vpxor xmm11, xmm11, xmm13 + vpsrldq xmm12, xmm11, 4 + vpslldq xmm11, xmm11, 12 + vpxor xmm7, xmm7, xmm11 + vpsrld xmm13, xmm7, 1 + vpsrld xmm9, xmm7, 2 + vpsrld xmm8, xmm7, 7 + vpxor xmm13, xmm13, xmm9 + vpxor xmm13, xmm13, xmm8 + vpxor xmm13, xmm13, xmm12 + vpxor xmm13, xmm13, xmm7 + vpxor xmm6, xmm6, xmm13 vpshufb xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask vpxor xmm0, xmm6, xmm15 cmp r8d, 16 @@ -11155,7 +11414,16 @@ L_AES_GCM_decrypt_final_avx1_cmp_tag_16: L_AES_GCM_decrypt_final_avx1_cmp_tag_done: mov DWORD PTR [rbp], r15d vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm8, OWORD PTR [rsp+48] + vmovdqu xmm9, OWORD PTR [rsp+64] + vmovdqu xmm10, OWORD PTR [rsp+80] + vmovdqu xmm11, OWORD PTR [rsp+96] + vmovdqu xmm12, OWORD PTR [rsp+112] + vmovdqu xmm13, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 pop r15 pop rbp pop r14 @@ -11246,7 +11514,17 @@ AES_GCM_encrypt_avx2 PROC mov r14d, DWORD PTR [rsp+128] mov rsi, QWORD PTR [rsp+136] mov r9d, DWORD PTR [rsp+144] - sub rsp, 160 + sub rsp, 320 + vmovdqu OWORD PTR [rsp+160], xmm6 + vmovdqu OWORD PTR [rsp+176], xmm7 + vmovdqu OWORD PTR [rsp+192], xmm8 + vmovdqu OWORD PTR [rsp+208], xmm9 + vmovdqu OWORD PTR [rsp+224], xmm10 + vmovdqu OWORD PTR [rsp+240], xmm11 + vmovdqu OWORD PTR [rsp+256], xmm12 + vmovdqu OWORD PTR [rsp+272], xmm13 + vmovdqu OWORD PTR [rsp+288], xmm14 + vmovdqu OWORD PTR [rsp+304], xmm15 vpxor xmm4, xmm4, xmm4 vpxor xmm6, xmm6, xmm6 mov edx, ebx @@ -12517,7 +12795,17 @@ L_AES_GCM_encrypt_avx2_store_tag_16: vmovdqu OWORD PTR [r15], xmm0 L_AES_GCM_encrypt_avx2_store_tag_done: vzeroupper - add rsp, 160 + vmovdqu xmm6, OWORD PTR [rsp+160] + vmovdqu xmm7, OWORD PTR [rsp+176] + vmovdqu xmm8, OWORD PTR [rsp+192] + vmovdqu xmm9, OWORD PTR [rsp+208] + vmovdqu xmm10, OWORD PTR [rsp+224] + vmovdqu xmm11, OWORD PTR [rsp+240] + vmovdqu xmm12, OWORD PTR [rsp+256] + vmovdqu xmm13, OWORD PTR [rsp+272] + vmovdqu xmm14, OWORD PTR [rsp+288] + vmovdqu xmm15, OWORD PTR [rsp+304] + add rsp, 320 pop rsi pop r14 pop rbx @@ -12550,7 +12838,17 @@ AES_GCM_decrypt_avx2 PROC mov rsi, QWORD PTR [rsp+144] mov r9d, DWORD PTR [rsp+152] mov rbp, QWORD PTR [rsp+160] - sub rsp, 168 + sub rsp, 328 + vmovdqu OWORD PTR [rsp+168], xmm6 + vmovdqu OWORD PTR [rsp+184], xmm7 + vmovdqu OWORD PTR [rsp+200], xmm8 + vmovdqu OWORD PTR [rsp+216], xmm9 + vmovdqu OWORD PTR [rsp+232], xmm10 + vmovdqu OWORD PTR [rsp+248], xmm11 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm13 + vmovdqu OWORD PTR [rsp+296], xmm14 + vmovdqu OWORD PTR [rsp+312], xmm15 vpxor xmm4, xmm4, xmm4 vpxor xmm6, xmm6, xmm6 mov edx, ebx @@ -13481,7 +13779,17 @@ L_AES_GCM_decrypt_avx2_cmp_tag_16: L_AES_GCM_decrypt_avx2_cmp_tag_done: mov DWORD PTR [rbp], eax vzeroupper - add rsp, 168 + vmovdqu xmm6, OWORD PTR [rsp+168] + vmovdqu xmm7, OWORD PTR [rsp+184] + vmovdqu xmm8, OWORD PTR [rsp+200] + vmovdqu xmm9, OWORD PTR [rsp+216] + vmovdqu xmm10, OWORD PTR [rsp+232] + vmovdqu xmm11, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm13, OWORD PTR [rsp+280] + vmovdqu xmm14, OWORD PTR [rsp+296] + vmovdqu xmm15, OWORD PTR [rsp+312] + add rsp, 328 pop rbp pop rsi pop r15 @@ -13506,7 +13814,9 @@ AES_GCM_init_avx2 PROC mov rax, QWORD PTR [rsp+72] mov r8, QWORD PTR [rsp+80] mov r9, QWORD PTR [rsp+88] - sub rsp, 16 + sub rsp, 48 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 vpxor xmm4, xmm4, xmm4 mov edx, r11d cmp edx, 12 @@ -13755,7 +14065,9 @@ L_AES_GCM_init_avx2_iv_done: vmovdqu OWORD PTR [rax], xmm5 vmovdqu OWORD PTR [r8], xmm4 vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + add rsp, 48 pop r12 pop rsi pop rdi @@ -13766,6 +14078,8 @@ _text ENDS _text SEGMENT READONLY PARA AES_GCM_aad_update_avx2 PROC mov rax, rcx + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 vmovdqu xmm4, OWORD PTR [r8] vmovdqu xmm5, OWORD PTR [r9] xor ecx, ecx @@ -13808,6 +14122,8 @@ L_AES_GCM_aad_update_avx2_16_loop: jl L_AES_GCM_aad_update_avx2_16_loop vmovdqu OWORD PTR [r8], xmm4 vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 ret AES_GCM_aad_update_avx2 ENDP _text ENDS @@ -13868,6 +14184,8 @@ AES_GCM_encrypt_block_avx2 ENDP _text ENDS _text SEGMENT READONLY PARA AES_GCM_ghash_block_avx2 PROC + sub rsp, 16 + vmovdqu OWORD PTR [rsp], xmm6 vmovdqu xmm4, OWORD PTR [rdx] vmovdqu xmm5, OWORD PTR [r8] vmovdqu xmm0, OWORD PTR [rcx] @@ -13905,6 +14223,8 @@ AES_GCM_ghash_block_avx2 PROC vpxor xmm4, xmm4, xmm1 vmovdqu OWORD PTR [rdx], xmm4 vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + add rsp, 16 ret AES_GCM_ghash_block_avx2 ENDP _text ENDS @@ -13923,7 +14243,17 @@ AES_GCM_encrypt_update_avx2 PROC mov r12, QWORD PTR [rsp+88] mov r13, QWORD PTR [rsp+96] mov r14, QWORD PTR [rsp+104] - sub rsp, 152 + sub rsp, 312 + vmovdqu OWORD PTR [rsp+152], xmm6 + vmovdqu OWORD PTR [rsp+168], xmm7 + vmovdqu OWORD PTR [rsp+184], xmm8 + vmovdqu OWORD PTR [rsp+200], xmm9 + vmovdqu OWORD PTR [rsp+216], xmm10 + vmovdqu OWORD PTR [rsp+232], xmm11 + vmovdqu OWORD PTR [rsp+248], xmm12 + vmovdqu OWORD PTR [rsp+264], xmm13 + vmovdqu OWORD PTR [rsp+280], xmm14 + vmovdqu OWORD PTR [rsp+296], xmm15 vmovdqu xmm6, OWORD PTR [r12] vmovdqu xmm5, OWORD PTR [r13] vmovdqu xmm4, OWORD PTR [r14] @@ -14754,7 +15084,17 @@ L_AES_GCM_encrypt_update_avx2_done_enc: vmovdqu OWORD PTR [r12], xmm6 vmovdqu OWORD PTR [r14], xmm4 vzeroupper - add rsp, 152 + vmovdqu xmm6, OWORD PTR [rsp+152] + vmovdqu xmm7, OWORD PTR [rsp+168] + vmovdqu xmm8, OWORD PTR [rsp+184] + vmovdqu xmm9, OWORD PTR [rsp+200] + vmovdqu xmm10, OWORD PTR [rsp+216] + vmovdqu xmm11, OWORD PTR [rsp+232] + vmovdqu xmm12, OWORD PTR [rsp+248] + vmovdqu xmm13, OWORD PTR [rsp+264] + vmovdqu xmm14, OWORD PTR [rsp+280] + vmovdqu xmm15, OWORD PTR [rsp+296] + add rsp, 312 pop rdi pop r15 pop r14 @@ -14770,7 +15110,9 @@ AES_GCM_encrypt_final_avx2 PROC mov eax, DWORD PTR [rsp+56] mov r10, QWORD PTR [rsp+64] mov r11, QWORD PTR [rsp+72] - sub rsp, 16 + sub rsp, 48 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 vmovdqu xmm4, OWORD PTR [rcx] vmovdqu xmm5, OWORD PTR [r10] vmovdqu xmm6, OWORD PTR [r11] @@ -14824,7 +15166,9 @@ L_AES_GCM_encrypt_final_avx2_store_tag_16: vmovdqu OWORD PTR [rdx], xmm0 L_AES_GCM_encrypt_final_avx2_store_tag_done: vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + add rsp, 48 pop r13 pop r12 ret @@ -14845,7 +15189,17 @@ AES_GCM_decrypt_update_avx2 PROC mov r12, QWORD PTR [rsp+88] mov r14, QWORD PTR [rsp+96] mov r15, QWORD PTR [rsp+104] - sub rsp, 168 + sub rsp, 328 + vmovdqu OWORD PTR [rsp+168], xmm6 + vmovdqu OWORD PTR [rsp+184], xmm7 + vmovdqu OWORD PTR [rsp+200], xmm8 + vmovdqu OWORD PTR [rsp+216], xmm9 + vmovdqu OWORD PTR [rsp+232], xmm10 + vmovdqu OWORD PTR [rsp+248], xmm11 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm13 + vmovdqu OWORD PTR [rsp+296], xmm14 + vmovdqu OWORD PTR [rsp+312], xmm15 vmovdqu xmm6, OWORD PTR [r12] vmovdqu xmm5, OWORD PTR [r14] vmovdqu xmm4, OWORD PTR [r15] @@ -15329,7 +15683,17 @@ L_AES_GCM_decrypt_update_avx2_done_dec: vmovdqu OWORD PTR [r12], xmm6 vmovdqu OWORD PTR [r15], xmm4 vzeroupper - add rsp, 168 + vmovdqu xmm6, OWORD PTR [rsp+168] + vmovdqu xmm7, OWORD PTR [rsp+184] + vmovdqu xmm8, OWORD PTR [rsp+200] + vmovdqu xmm9, OWORD PTR [rsp+216] + vmovdqu xmm10, OWORD PTR [rsp+232] + vmovdqu xmm11, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm13, OWORD PTR [rsp+280] + vmovdqu xmm14, OWORD PTR [rsp+296] + vmovdqu xmm15, OWORD PTR [rsp+312] + add rsp, 328 pop rdi pop r15 pop r14 @@ -15347,7 +15711,9 @@ AES_GCM_decrypt_final_avx2 PROC mov r10, QWORD PTR [rsp+72] mov r11, QWORD PTR [rsp+80] mov r12, QWORD PTR [rsp+88] - sub rsp, 16 + sub rsp, 48 + vmovdqu OWORD PTR [rsp+16], xmm6 + vmovdqu OWORD PTR [rsp+32], xmm7 vmovdqu xmm4, OWORD PTR [rcx] vmovdqu xmm5, OWORD PTR [r10] vmovdqu xmm6, OWORD PTR [r11] @@ -15412,7 +15778,9 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_16: L_AES_GCM_decrypt_final_avx2_cmp_tag_done: mov DWORD PTR [r12], r10d vzeroupper - add rsp, 16 + vmovdqu xmm6, OWORD PTR [rsp+16] + vmovdqu xmm7, OWORD PTR [rsp+32] + add rsp, 48 pop r14 pop r13 pop r12