diff --git a/wolfcrypt/src/aes_gcm_x86_asm.S b/wolfcrypt/src/aes_gcm_x86_asm.S new file mode 100644 index 000000000..8a384996e --- /dev/null +++ b/wolfcrypt/src/aes_gcm_x86_asm.S @@ -0,0 +1,12962 @@ +/* aes_gcm_x86_asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef WOLFSSL_USER_SETTINGS +#include "wolfssl/wolfcrypt/settings.h" +#endif + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#define HAVE_INTEL_AVX2 +#endif /* NO_AVX2_SUPPORT */ + +.type data, @object +L_aes_gcm_one: +.long 0x0,0x0,0x1,0x0 +.type data, @object +L_aes_gcm_two: +.long 0x0,0x0,0x2,0x0 +.type data, @object +L_aes_gcm_three: +.long 0x0,0x0,0x3,0x0 +.type data, @object +L_aes_gcm_four: +.long 0x0,0x0,0x4,0x0 +.type data, @object +L_aes_gcm_bswap_epi64: +.long 0x4050607,0x10203,0xc0d0e0f,0x8090a0b +.type data, @object +L_aes_gcm_bswap_mask: +.long 0xc0d0e0f,0x8090a0b,0x4050607,0x10203 +.type data, @object +L_aes_gcm_mod2_128: +.long 0x1,0x0,0x0,0xc2000000 +.type data, @object +L_aes_gcm_avx1_one: +.long 0x0,0x0,0x1,0x0 +.type data, @object +L_aes_gcm_avx1_two: +.long 0x0,0x0,0x2,0x0 +.type data, @object +L_aes_gcm_avx1_three: +.long 0x0,0x0,0x3,0x0 +.type data, @object +L_aes_gcm_avx1_four: +.long 0x0,0x0,0x4,0x0 +.type data, @object +L_aes_gcm_avx1_bswap_epi64: +.long 0x4050607,0x10203,0xc0d0e0f,0x8090a0b +.type data, @object +L_aes_gcm_avx1_bswap_mask: +.long 0xc0d0e0f,0x8090a0b,0x4050607,0x10203 +.type data, @object +L_aes_gcm_avx1_mod2_128: +.long 0x1,0x0,0x0,0xc2000000 +.type data, @object +L_aes_gcm_avx2_one: +.long 0x0,0x0,0x1,0x0 +.type data, @object +L_aes_gcm_avx2_two: +.long 0x0,0x0,0x2,0x0 +.type data, @object +L_aes_gcm_avx2_three: +.long 0x0,0x0,0x3,0x0 +.type data, @object +L_aes_gcm_avx2_four: +.long 0x0,0x0,0x4,0x0 +.type data, @object +L_avx2_aes_gcm_bswap_one: +.long 0x0,0x0,0x0,0x1000000 +.type data, @object +L_aes_gcm_avx2_bswap_epi64: +.long 0x4050607,0x10203,0xc0d0e0f,0x8090a0b +.type data, @object +L_aes_gcm_avx2_bswap_mask: +.long 0xc0d0e0f,0x8090a0b,0x4050607,0x10203 +.type data, @object +L_aes_gcm_avx2_mod2_128: +.long 0x1,0x0,0x0,0xc2000000 +.text +.globl AES_GCM_encrypt_aesni +.type AES_GCM_encrypt_aesni,@function +.align 16 +AES_GCM_encrypt_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x70, %esp + movl 144(%esp), %esi + movl 168(%esp), %ebp + movl 160(%esp), %edx + pxor %xmm0, %xmm0 + pxor %xmm2, %xmm2 + cmpl $12, %edx + jne L_AES_GCM_encrypt_aesni_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + pinsrd $0x00, (%esi), %xmm0 + pinsrd $0x01, 4(%esi), %xmm0 + pinsrd $2, 8(%esi), %xmm0 + pinsrd $3, %ecx, %xmm0 + # H = Encrypt X(=0) and T = Encrypt counter + movdqa %xmm0, %xmm5 + movdqa (%ebp), %xmm1 + 
pxor %xmm1, %xmm5 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_calc_iv_12_last + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_calc_iv_12_last + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_aesni_calc_iv_12_last: + aesenclast %xmm3, %xmm1 + aesenclast %xmm3, %xmm5 + pshufb L_aes_gcm_bswap_mask, %xmm1 + movdqu %xmm5, 80(%esp) + jmp L_AES_GCM_encrypt_aesni_iv_done +L_AES_GCM_encrypt_aesni_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + movdqa (%ebp), %xmm1 + aesenc 16(%ebp), %xmm1 + aesenc 32(%ebp), %xmm1 + aesenc 48(%ebp), %xmm1 + aesenc 64(%ebp), %xmm1 + aesenc 80(%ebp), %xmm1 + aesenc 96(%ebp), %xmm1 + aesenc 112(%ebp), %xmm1 + aesenc 128(%ebp), %xmm1 + aesenc 144(%ebp), %xmm1 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm5, %xmm1 + aesenc 176(%ebp), %xmm1 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm5, %xmm1 + aesenc 208(%ebp), %xmm1 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last: + aesenclast %xmm5, %xmm1 + pshufb L_aes_gcm_bswap_mask, %xmm1 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_encrypt_aesni_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_aesni_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_aesni_calc_iv_16_loop: + movdqu (%esi,%ecx,1), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, 
%xmm0 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_aesni_calc_iv_16_loop + movl 160(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_aesni_calc_iv_done +L_AES_GCM_encrypt_aesni_calc_iv_lt16: + subl $16, %esp + pxor %xmm4, %xmm4 + xorl %ebx, %ebx + movdqu %xmm4, (%esp) +L_AES_GCM_encrypt_aesni_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_aesni_calc_iv_loop + movdqu (%esp), %xmm4 + addl $16, %esp + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm0 +L_AES_GCM_encrypt_aesni_calc_iv_done: + # T = Encrypt counter + pxor %xmm4, %xmm4 + shll $3, %edx + pinsrd $0x00, %edx, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + # Encrypt counter + movdqa (%ebp), %xmm4 + pxor %xmm0, %xmm4 + aesenc 16(%ebp), %xmm4 + aesenc 32(%ebp), %xmm4 + aesenc 48(%ebp), %xmm4 + aesenc 64(%ebp), %xmm4 + aesenc 80(%ebp), %xmm4 + aesenc 96(%ebp), %xmm4 + aesenc 112(%ebp), %xmm4 + aesenc 128(%ebp), %xmm4 + aesenc 144(%ebp), %xmm4 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 176(%ebp), %xmm4 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm5 + jl 
L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 208(%ebp), %xmm4 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last: + aesenclast %xmm5, %xmm4 + movdqu %xmm4, 80(%esp) +L_AES_GCM_encrypt_aesni_iv_done: + movl 140(%esp), %esi + # Additional authentication data + movl 156(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_aesni_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_aesni_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_aesni_calc_aad_16_loop: + movdqu (%esi,%ecx,1), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm2, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm2, %xmm7 + pclmulqdq $0x00, %xmm2, %xmm4 + pxor %xmm2, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm2, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm2 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm2 + por %xmm4, %xmm3 + por %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm2 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_aesni_calc_aad_16_loop + movl 156(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_aesni_calc_aad_done +L_AES_GCM_encrypt_aesni_calc_aad_lt16: + subl $16, %esp + pxor %xmm4, %xmm4 + xorl %ebx, %ebx + movdqu %xmm4, (%esp) +L_AES_GCM_encrypt_aesni_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_aesni_calc_aad_loop + movdqu (%esp), %xmm4 + addl $16, %esp + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm2, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm2, %xmm7 + pclmulqdq $0x00, %xmm2, %xmm4 + pxor %xmm2, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm2, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm2 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm2 + por %xmm4, %xmm3 + por %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm2 +L_AES_GCM_encrypt_aesni_calc_aad_done: + movdqu %xmm2, 
96(%esp) + movl 132(%esp), %esi + movl 136(%esp), %edi + # Calculate counter and H + pshufb L_aes_gcm_bswap_epi64, %xmm0 + movdqa %xmm1, %xmm5 + paddd L_aes_gcm_one, %xmm0 + movdqa %xmm1, %xmm4 + movdqu %xmm0, 64(%esp) + psrlq $63, %xmm5 + psllq $0x01, %xmm4 + pslldq $8, %xmm5 + por %xmm5, %xmm4 + pshufd $0xff, %xmm1, %xmm1 + psrad $31, %xmm1 + pand L_aes_gcm_mod2_128, %xmm1 + pxor %xmm4, %xmm1 + xorl %ebx, %ebx + movl 152(%esp), %eax + cmpl $0x40, %eax + jl L_AES_GCM_encrypt_aesni_done_64 + andl $0xffffffc0, %eax + movdqa %xmm2, %xmm6 + # H ^ 1 + movdqu %xmm1, (%esp) + # H ^ 2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm0 + movdqu %xmm0, 16(%esp) + # H ^ 3 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm0, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm3 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm3 + movdqu %xmm3, 32(%esp) + # H ^ 4 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm3 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm3 + movdqu %xmm3, 48(%esp) + # First 64 bytes of input + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm4 + movdqa L_aes_gcm_bswap_epi64, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pshufb %xmm3, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pshufb %xmm3, %xmm5 + paddd L_aes_gcm_two, %xmm6 + pshufb %xmm3, %xmm6 + paddd L_aes_gcm_three, 
%xmm7 + pshufb %xmm3, %xmm7 + movdqu 64(%esp), %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) + movdqa (%ebp), %xmm3 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm3, %xmm7 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_enc_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_enc_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_aesni_enc_done: + aesenclast %xmm3, %xmm4 + aesenclast %xmm3, %xmm5 + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm1 + pxor %xmm0, %xmm4 + pxor %xmm1, %xmm5 + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + aesenclast %xmm3, %xmm6 + aesenclast %xmm3, %xmm7 + movdqu 32(%esi), %xmm0 + movdqu 48(%esi), %xmm1 + pxor %xmm0, %xmm6 + pxor %xmm1, %xmm7 + movdqu %xmm6, 32(%edi) + movdqu %xmm7, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx + jle L_AES_GCM_encrypt_aesni_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_aesni_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm4 + movdqa L_aes_gcm_bswap_epi64, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pshufb %xmm3, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pshufb %xmm3, %xmm5 + paddd L_aes_gcm_two, %xmm6 + pshufb %xmm3, %xmm6 + paddd L_aes_gcm_three, %xmm7 + pshufb %xmm3, %xmm7 + movdqu 64(%esp), %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) + movdqa (%ebp), %xmm3 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm3, %xmm7 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, 
%xmm7 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done: + aesenclast %xmm3, %xmm4 + aesenclast %xmm3, %xmm5 + movdqu (%ecx), %xmm0 + movdqu 16(%ecx), %xmm1 + pxor %xmm0, %xmm4 + pxor %xmm1, %xmm5 + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + aesenclast %xmm3, %xmm6 + aesenclast %xmm3, %xmm7 + movdqu 32(%ecx), %xmm0 + movdqu 48(%ecx), %xmm1 + pxor %xmm0, %xmm6 + pxor %xmm1, %xmm7 + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + # ghash encrypted counter + movdqu 96(%esp), %xmm6 + movdqu 48(%esp), %xmm3 + movdqu -64(%edx), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm6, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm3, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm7 + pclmulqdq $0x11, %xmm3, %xmm7 + movdqa %xmm4, %xmm6 + pclmulqdq $0x00, %xmm3, %xmm6 + pclmulqdq $0x00, %xmm1, %xmm5 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqu 32(%esp), %xmm3 + movdqu -48(%edx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu 16(%esp), %xmm3 + movdqu -32(%edx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu (%esp), %xmm3 + movdqu -16(%edx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqa %xmm5, %xmm1 + psrldq $8, %xmm5 + pslldq $8, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm5, %xmm7 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + pslld $31, %xmm3 + pslld $30, %xmm0 + pslld $25, %xmm1 + pxor %xmm0, %xmm3 + pxor %xmm1, %xmm3 + movdqa %xmm3, %xmm0 + pslldq $12, %xmm3 + psrldq $4, %xmm0 + pxor %xmm3, %xmm6 + movdqa %xmm6, %xmm1 + movdqa 
%xmm6, %xmm5 + movdqa %xmm6, %xmm4 + psrld $0x01, %xmm1 + psrld $2, %xmm5 + psrld $7, %xmm4 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm7, %xmm6 + movdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_aesni_ghash_64 +L_AES_GCM_encrypt_aesni_end_64: + movdqu 96(%esp), %xmm2 + # Block 1 + movdqa L_aes_gcm_bswap_mask, %xmm4 + movdqu (%edx), %xmm1 + pshufb %xmm4, %xmm1 + movdqu 48(%esp), %xmm3 + pxor %xmm2, %xmm1 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm3, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm0 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm0 + pxor %xmm5, %xmm2 + # Block 2 + movdqa L_aes_gcm_bswap_mask, %xmm4 + movdqu 16(%edx), %xmm1 + pshufb %xmm4, %xmm1 + movdqu 32(%esp), %xmm3 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm3, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + pxor %xmm4, %xmm0 + pxor %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm0 + pxor %xmm5, %xmm2 + # Block 3 + movdqa L_aes_gcm_bswap_mask, %xmm4 + movdqu 32(%edx), %xmm1 + pshufb %xmm4, %xmm1 + movdqu 16(%esp), %xmm3 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm3, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + pxor %xmm4, %xmm0 + pxor %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm0 + pxor %xmm5, %xmm2 + # Block 4 + movdqa L_aes_gcm_bswap_mask, %xmm4 + movdqu 48(%edx), %xmm1 + pshufb %xmm4, %xmm1 + movdqu (%esp), %xmm3 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm3, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + pxor %xmm4, %xmm0 + pxor %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm0 + movdqa %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm0, %xmm6 + pxor %xmm6, %xmm2 + movdqu (%esp), %xmm1 +L_AES_GCM_encrypt_aesni_done_64: + movl 152(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_aesni_done_enc + movl 152(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_aesni_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu 64(%esp), %xmm4 + movdqa %xmm4, %xmm5 + pshufb L_aes_gcm_bswap_epi64, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pxor (%ebp), %xmm4 + movdqu %xmm5, 64(%esp) + aesenc 16(%ebp), %xmm4 + aesenc 32(%ebp), %xmm4 + aesenc 48(%ebp), %xmm4 + aesenc 64(%ebp), 
%xmm4 + aesenc 80(%ebp), %xmm4 + aesenc 96(%ebp), %xmm4 + aesenc 112(%ebp), %xmm4 + aesenc 128(%ebp), %xmm4 + aesenc 144(%ebp), %xmm4 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 176(%ebp), %xmm4 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 208(%ebp), %xmm4 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last: + aesenclast %xmm5, %xmm4 + movdqu (%ecx), %xmm5 + pxor %xmm5, %xmm4 + movdqu %xmm4, (%edx) + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_aesni_last_block_ghash +L_AES_GCM_encrypt_aesni_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu 64(%esp), %xmm4 + movdqa %xmm4, %xmm5 + pshufb L_aes_gcm_bswap_epi64, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pxor (%ebp), %xmm4 + movdqu %xmm5, 64(%esp) + movdqu %xmm2, %xmm0 + pclmulqdq $16, %xmm1, %xmm0 + aesenc 16(%ebp), %xmm4 + aesenc 32(%ebp), %xmm4 + movdqu %xmm2, %xmm3 + pclmulqdq $0x01, %xmm1, %xmm3 + aesenc 48(%ebp), %xmm4 + aesenc 64(%ebp), %xmm4 + aesenc 80(%ebp), %xmm4 + movdqu %xmm2, %xmm5 + pclmulqdq $0x11, %xmm1, %xmm5 + aesenc 96(%ebp), %xmm4 + pxor %xmm3, %xmm0 + movdqa %xmm0, %xmm6 + psrldq $8, %xmm0 + pslldq $8, %xmm6 + aesenc 112(%ebp), %xmm4 + movdqu %xmm2, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm3 + pxor %xmm3, %xmm6 + pxor %xmm0, %xmm5 + movdqa L_aes_gcm_mod2_128, %xmm7 + movdqa %xmm6, %xmm3 + pclmulqdq $16, %xmm7, %xmm3 + aesenc 128(%ebp), %xmm4 + pshufd $0x4e, %xmm6, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, %xmm3 + pclmulqdq $16, %xmm7, %xmm3 + aesenc 144(%ebp), %xmm4 + pshufd $0x4e, %xmm0, %xmm2 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last + aesenc %xmm5, %xmm4 + aesenc 176(%ebp), %xmm4 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last + aesenc %xmm5, %xmm4 + aesenc 208(%ebp), %xmm4 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_aesni_aesenc_gfmul_last: + aesenclast %xmm5, %xmm4 + movdqu (%ecx), %xmm5 + pxor %xmm5, %xmm4 + movdqu %xmm4, (%edx) + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_aesni_last_block_start +L_AES_GCM_encrypt_aesni_last_block_ghash: + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm2, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm2, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 +L_AES_GCM_encrypt_aesni_last_block_done: + movl 152(%esp), %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done + movdqu 64(%esp), %xmm0 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + pxor (%ebp), %xmm0 + aesenc 
16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + aesenc 96(%ebp), %xmm0 + aesenc 112(%ebp), %xmm0 + aesenc 128(%ebp), %xmm0 + aesenc 144(%ebp), %xmm0 + cmpl $11, 172(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last + aesenc %xmm5, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 172(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last + aesenc %xmm5, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last: + aesenclast %xmm5, %xmm0 + subl $16, %esp + xorl %ecx, %ecx + movdqu %xmm0, (%esp) +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop: + movzbl (%esi,%ebx,1), %eax + xorb (%esp,%ecx,1), %al + movb %al, (%edi,%ebx,1) + movb %al, (%esp,%ecx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop + xorl %eax, %eax + cmpl $16, %ecx + je L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop: + movb %al, (%esp,%ecx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc: + movdqu (%esp), %xmm0 + addl $16, %esp + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm2, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm2, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_aesni_done_enc: + movl 148(%esp), %edi + movl 164(%esp), %ebx + movl 152(%esp), %edx + movl 156(%esp), %ecx + shll $3, %edx + shll $3, %ecx + pinsrd $0x00, %edx, %xmm4 + pinsrd $2, %ecx, %xmm4 + movl 152(%esp), %edx + movl 156(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + pinsrd $0x01, %edx, %xmm4 + pinsrd $3, %ecx, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm2, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm2, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 + pshufb L_aes_gcm_bswap_mask, %xmm2 + 
movdqu 80(%esp), %xmm4 + pxor %xmm2, %xmm4 + cmpl $16, %ebx + je L_AES_GCM_encrypt_aesni_store_tag_16 + xorl %ecx, %ecx + movdqu %xmm4, (%esp) +L_AES_GCM_encrypt_aesni_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl %ebx, %ecx + jne L_AES_GCM_encrypt_aesni_store_tag_loop + jmp L_AES_GCM_encrypt_aesni_store_tag_done +L_AES_GCM_encrypt_aesni_store_tag_16: + movdqu %xmm4, (%edi) +L_AES_GCM_encrypt_aesni_store_tag_done: + addl $0x70, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_aesni,.-AES_GCM_encrypt_aesni +.text +.globl AES_GCM_decrypt_aesni +.type AES_GCM_decrypt_aesni,@function +.align 16 +AES_GCM_decrypt_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xb0, %esp + movl 208(%esp), %esi + movl 232(%esp), %ebp + movl 224(%esp), %edx + pxor %xmm0, %xmm0 + pxor %xmm2, %xmm2 + cmpl $12, %edx + jne L_AES_GCM_decrypt_aesni_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + pinsrd $0x00, (%esi), %xmm0 + pinsrd $0x01, 4(%esi), %xmm0 + pinsrd $2, 8(%esi), %xmm0 + pinsrd $3, %ecx, %xmm0 + # H = Encrypt X(=0) and T = Encrypt counter + movdqa %xmm0, %xmm5 + movdqa (%ebp), %xmm1 + pxor %xmm1, %xmm5 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesni_calc_iv_12_last + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesni_calc_iv_12_last + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm1 + aesenc %xmm3, %xmm5 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_aesni_calc_iv_12_last: + aesenclast %xmm3, %xmm1 + aesenclast %xmm3, %xmm5 + pshufb L_aes_gcm_bswap_mask, %xmm1 + movdqu %xmm5, 80(%esp) + jmp L_AES_GCM_decrypt_aesni_iv_done +L_AES_GCM_decrypt_aesni_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + movdqa (%ebp), %xmm1 + aesenc 16(%ebp), %xmm1 + aesenc 32(%ebp), %xmm1 + aesenc 48(%ebp), %xmm1 + aesenc 64(%ebp), %xmm1 + aesenc 80(%ebp), %xmm1 + aesenc 96(%ebp), %xmm1 + aesenc 112(%ebp), %xmm1 + aesenc 128(%ebp), %xmm1 + aesenc 144(%ebp), %xmm1 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm5, %xmm1 + aesenc 176(%ebp), %xmm1 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm5, %xmm1 + aesenc 208(%ebp), %xmm1 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last: + aesenclast %xmm5, %xmm1 + pshufb L_aes_gcm_bswap_mask, %xmm1 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_decrypt_aesni_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_aesni_calc_iv_lt16 + andl $0xfffffff0, %edx 
+L_AES_GCM_decrypt_aesni_calc_iv_16_loop: + movdqu (%esi,%ecx,1), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm0 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_aesni_calc_iv_16_loop + movl 224(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_aesni_calc_iv_done +L_AES_GCM_decrypt_aesni_calc_iv_lt16: + subl $16, %esp + pxor %xmm4, %xmm4 + xorl %ebx, %ebx + movdqu %xmm4, (%esp) +L_AES_GCM_decrypt_aesni_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_aesni_calc_iv_loop + movdqu (%esp), %xmm4 + addl $16, %esp + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm0 +L_AES_GCM_decrypt_aesni_calc_iv_done: + # T = Encrypt counter + pxor %xmm4, %xmm4 + shll $3, %edx + pinsrd $0x00, %edx, %xmm4 + pxor %xmm4, %xmm0 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm0 + 
movdqa %xmm3, %xmm4 + movdqa %xmm0, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm0 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm0 + por %xmm4, %xmm3 + por %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + # Encrypt counter + movdqa (%ebp), %xmm4 + pxor %xmm0, %xmm4 + aesenc 16(%ebp), %xmm4 + aesenc 32(%ebp), %xmm4 + aesenc 48(%ebp), %xmm4 + aesenc 64(%ebp), %xmm4 + aesenc 80(%ebp), %xmm4 + aesenc 96(%ebp), %xmm4 + aesenc 112(%ebp), %xmm4 + aesenc 128(%ebp), %xmm4 + aesenc 144(%ebp), %xmm4 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 176(%ebp), %xmm4 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm5, %xmm4 + aesenc 208(%ebp), %xmm4 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last: + aesenclast %xmm5, %xmm4 + movdqu %xmm4, 80(%esp) +L_AES_GCM_decrypt_aesni_iv_done: + movl 204(%esp), %esi + # Additional authentication data + movl 220(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_aesni_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_aesni_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_aesni_calc_aad_16_loop: + movdqu (%esi,%ecx,1), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm2, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm2, %xmm7 + pclmulqdq $0x00, %xmm2, %xmm4 + pxor %xmm2, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm2, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm2 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm2 + por %xmm4, %xmm3 + por %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm2 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_aesni_calc_aad_16_loop + movl 220(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_aesni_calc_aad_done +L_AES_GCM_decrypt_aesni_calc_aad_lt16: + subl $16, %esp + pxor %xmm4, %xmm4 + xorl %ebx, %ebx + movdqu %xmm4, (%esp) +L_AES_GCM_decrypt_aesni_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_aesni_calc_aad_loop + movdqu (%esp), %xmm4 + addl $16, %esp + 
pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm2, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm2, %xmm7 + pclmulqdq $0x00, %xmm2, %xmm4 + pxor %xmm2, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm4, %xmm3 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm2, %xmm5 + psrld $31, %xmm4 + psrld $31, %xmm5 + pslld $0x01, %xmm3 + pslld $0x01, %xmm2 + movdqa %xmm4, %xmm6 + pslldq $4, %xmm4 + psrldq $12, %xmm6 + pslldq $4, %xmm5 + por %xmm6, %xmm2 + por %xmm4, %xmm3 + por %xmm5, %xmm2 + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm5 + movdqa %xmm3, %xmm6 + pslld $31, %xmm4 + pslld $30, %xmm5 + pslld $25, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + movdqa %xmm4, %xmm5 + psrldq $4, %xmm5 + pslldq $12, %xmm4 + pxor %xmm4, %xmm3 + movdqa %xmm3, %xmm6 + movdqa %xmm3, %xmm7 + movdqa %xmm3, %xmm4 + psrld $0x01, %xmm6 + psrld $2, %xmm7 + psrld $7, %xmm4 + pxor %xmm7, %xmm6 + pxor %xmm4, %xmm6 + pxor %xmm5, %xmm6 + pxor %xmm3, %xmm6 + pxor %xmm6, %xmm2 +L_AES_GCM_decrypt_aesni_calc_aad_done: + movdqu %xmm2, 96(%esp) + movl 196(%esp), %esi + movl 200(%esp), %edi + # Calculate counter and H + pshufb L_aes_gcm_bswap_epi64, %xmm0 + movdqa %xmm1, %xmm5 + paddd L_aes_gcm_one, %xmm0 + movdqa %xmm1, %xmm4 + movdqu %xmm0, 64(%esp) + psrlq $63, %xmm5 + psllq $0x01, %xmm4 + pslldq $8, %xmm5 + por %xmm5, %xmm4 + pshufd $0xff, %xmm1, %xmm1 + psrad $31, %xmm1 + pand L_aes_gcm_mod2_128, %xmm1 + pxor %xmm4, %xmm1 + xorl %ebx, %ebx + cmpl $0x40, 216(%esp) + movl 216(%esp), %eax + jl L_AES_GCM_decrypt_aesni_done_64 + andl $0xffffffc0, %eax + movdqa %xmm2, %xmm6 + # H ^ 1 + movdqu %xmm1, (%esp) + # H ^ 2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm1, %xmm6 + movdqa %xmm1, %xmm7 + movdqa %xmm1, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm0 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm0 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm0 + movdqu %xmm0, 16(%esp) + # H ^ 3 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm0, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm3 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, 
%xmm5 + pxor %xmm5, %xmm3 + movdqu %xmm3, 32(%esp) + # H ^ 4 + pshufd $0x4e, %xmm0, %xmm5 + pshufd $0x4e, %xmm0, %xmm6 + movdqa %xmm0, %xmm7 + movdqa %xmm0, %xmm4 + pclmulqdq $0x11, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm0, %xmm4 + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm3 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm3 + movdqu %xmm3, 48(%esp) + cmpl %esi, %edi + jne L_AES_GCM_decrypt_aesni_ghash_64 +L_AES_GCM_decrypt_aesni_ghash_64_inplace: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm4 + movdqa L_aes_gcm_bswap_epi64, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pshufb %xmm3, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pshufb %xmm3, %xmm5 + paddd L_aes_gcm_two, %xmm6 + pshufb %xmm3, %xmm6 + paddd L_aes_gcm_three, %xmm7 + pshufb %xmm3, %xmm7 + movdqu 64(%esp), %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) + movdqa (%ebp), %xmm3 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm3, %xmm7 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done: + aesenclast %xmm3, %xmm4 + aesenclast %xmm3, %xmm5 + movdqu (%ecx), %xmm0 + movdqu 16(%ecx), %xmm1 + pxor %xmm0, %xmm4 + pxor %xmm1, %xmm5 + movdqu %xmm0, 112(%esp) + movdqu %xmm1, 128(%esp) + movdqu %xmm4, (%edx) + 
movdqu %xmm5, 16(%edx) + aesenclast %xmm3, %xmm6 + aesenclast %xmm3, %xmm7 + movdqu 32(%ecx), %xmm0 + movdqu 48(%ecx), %xmm1 + pxor %xmm0, %xmm6 + pxor %xmm1, %xmm7 + movdqu %xmm0, 144(%esp) + movdqu %xmm1, 160(%esp) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + # ghash encrypted counter + movdqu 96(%esp), %xmm6 + movdqu 48(%esp), %xmm3 + movdqu 112(%esp), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm6, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm3, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm7 + pclmulqdq $0x11, %xmm3, %xmm7 + movdqa %xmm4, %xmm6 + pclmulqdq $0x00, %xmm3, %xmm6 + pclmulqdq $0x00, %xmm1, %xmm5 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqu 32(%esp), %xmm3 + movdqu 128(%esp), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu 16(%esp), %xmm3 + movdqu 144(%esp), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu (%esp), %xmm3 + movdqu 160(%esp), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqa %xmm5, %xmm1 + psrldq $8, %xmm5 + pslldq $8, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm5, %xmm7 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + pslld $31, %xmm3 + pslld $30, %xmm0 + pslld $25, %xmm1 + pxor %xmm0, %xmm3 + pxor %xmm1, %xmm3 + movdqa %xmm3, %xmm0 + pslldq $12, %xmm3 + psrldq $4, %xmm0 + pxor %xmm3, %xmm6 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm5 + movdqa %xmm6, %xmm4 + psrld $0x01, %xmm1 + psrld $2, %xmm5 + psrld $7, %xmm4 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm7, %xmm6 + movdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_aesni_ghash_64_inplace + jmp L_AES_GCM_decrypt_aesni_ghash_64_done +L_AES_GCM_decrypt_aesni_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm4 + movdqa L_aes_gcm_bswap_epi64, %xmm3 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pshufb %xmm3, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pshufb %xmm3, %xmm5 + paddd L_aes_gcm_two, %xmm6 + pshufb %xmm3, %xmm6 + paddd L_aes_gcm_three, %xmm7 + pshufb %xmm3, %xmm7 + movdqu 64(%esp), %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) + movdqa (%ebp), %xmm3 + pxor %xmm3, %xmm4 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm3, %xmm7 + movdqa 16(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 32(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 48(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc 
%xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 64(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 80(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 96(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 112(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 128(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 144(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 176(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 208(%ebp), %xmm3 + aesenc %xmm3, %xmm4 + aesenc %xmm3, %xmm5 + aesenc %xmm3, %xmm6 + aesenc %xmm3, %xmm7 + movdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done: + aesenclast %xmm3, %xmm4 + aesenclast %xmm3, %xmm5 + movdqu (%ecx), %xmm0 + movdqu 16(%ecx), %xmm1 + pxor %xmm0, %xmm4 + pxor %xmm1, %xmm5 + movdqu %xmm0, (%ecx) + movdqu %xmm1, 16(%ecx) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + aesenclast %xmm3, %xmm6 + aesenclast %xmm3, %xmm7 + movdqu 32(%ecx), %xmm0 + movdqu 48(%ecx), %xmm1 + pxor %xmm0, %xmm6 + pxor %xmm1, %xmm7 + movdqu %xmm0, 32(%ecx) + movdqu %xmm1, 48(%ecx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + # ghash encrypted counter + movdqu 96(%esp), %xmm6 + movdqu 48(%esp), %xmm3 + movdqu (%ecx), %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm6, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm3, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm7 + pclmulqdq $0x11, %xmm3, %xmm7 + movdqa %xmm4, %xmm6 + pclmulqdq $0x00, %xmm3, %xmm6 + pclmulqdq $0x00, %xmm1, %xmm5 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqu 32(%esp), %xmm3 + movdqu 16(%ecx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu 16(%esp), %xmm3 + movdqu 32(%ecx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqu (%esp), %xmm3 + movdqu 48(%ecx), %xmm4 + pshufd $0x4e, %xmm3, %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm4 + pxor %xmm3, %xmm0 + pshufd $0x4e, %xmm4, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pclmulqdq $0x11, %xmm3, %xmm2 + pclmulqdq $0x00, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm1, %xmm0 + pxor %xmm3, %xmm5 + pxor %xmm3, %xmm6 + pxor %xmm2, %xmm5 + pxor %xmm2, %xmm7 + pxor %xmm0, %xmm5 + movdqa %xmm5, %xmm1 + psrldq 
$8, %xmm5 + pslldq $8, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm5, %xmm7 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + pslld $31, %xmm3 + pslld $30, %xmm0 + pslld $25, %xmm1 + pxor %xmm0, %xmm3 + pxor %xmm1, %xmm3 + movdqa %xmm3, %xmm0 + pslldq $12, %xmm3 + psrldq $4, %xmm0 + pxor %xmm3, %xmm6 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm5 + movdqa %xmm6, %xmm4 + psrld $0x01, %xmm1 + psrld $2, %xmm5 + psrld $7, %xmm4 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm6 + pxor %xmm7, %xmm6 + movdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_aesni_ghash_64 +L_AES_GCM_decrypt_aesni_ghash_64_done: + movdqa %xmm6, %xmm2 + movdqu (%esp), %xmm1 +L_AES_GCM_decrypt_aesni_done_64: + movl 216(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_aesni_done_dec + movl 216(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_aesni_last_block_done +L_AES_GCM_decrypt_aesni_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu (%ecx), %xmm5 + pshufb L_aes_gcm_bswap_mask, %xmm5 + pxor %xmm2, %xmm5 + movdqu %xmm5, (%esp) + movdqu 64(%esp), %xmm4 + movdqa %xmm4, %xmm5 + pshufb L_aes_gcm_bswap_epi64, %xmm4 + paddd L_aes_gcm_one, %xmm5 + pxor (%ebp), %xmm4 + movdqu %xmm5, 64(%esp) + movdqu (%esp), %xmm0 + pclmulqdq $16, %xmm1, %xmm0 + aesenc 16(%ebp), %xmm4 + aesenc 32(%ebp), %xmm4 + movdqu (%esp), %xmm3 + pclmulqdq $0x01, %xmm1, %xmm3 + aesenc 48(%ebp), %xmm4 + aesenc 64(%ebp), %xmm4 + aesenc 80(%ebp), %xmm4 + movdqu (%esp), %xmm5 + pclmulqdq $0x11, %xmm1, %xmm5 + aesenc 96(%ebp), %xmm4 + pxor %xmm3, %xmm0 + movdqa %xmm0, %xmm6 + psrldq $8, %xmm0 + pslldq $8, %xmm6 + aesenc 112(%ebp), %xmm4 + movdqu (%esp), %xmm3 + pclmulqdq $0x00, %xmm1, %xmm3 + pxor %xmm3, %xmm6 + pxor %xmm0, %xmm5 + movdqa L_aes_gcm_mod2_128, %xmm7 + movdqa %xmm6, %xmm3 + pclmulqdq $16, %xmm7, %xmm3 + aesenc 128(%ebp), %xmm4 + pshufd $0x4e, %xmm6, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, %xmm3 + pclmulqdq $16, %xmm7, %xmm3 + aesenc 144(%ebp), %xmm4 + pshufd $0x4e, %xmm0, %xmm2 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last + aesenc %xmm5, %xmm4 + aesenc 176(%ebp), %xmm4 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last + aesenc %xmm5, %xmm4 + aesenc 208(%ebp), %xmm4 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_aesni_aesenc_gfmul_last: + aesenclast %xmm5, %xmm4 + movdqu (%ecx), %xmm5 + pxor %xmm5, %xmm4 + movdqu %xmm4, (%edx) + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_aesni_last_block_start +L_AES_GCM_decrypt_aesni_last_block_done: + movl 216(%esp), %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done + movdqu 64(%esp), %xmm0 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + pxor (%ebp), %xmm0 + aesenc 16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + aesenc 96(%ebp), %xmm0 + aesenc 112(%ebp), %xmm0 + aesenc 128(%ebp), %xmm0 + aesenc 144(%ebp), %xmm0 + cmpl $11, 236(%esp) + movdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last + aesenc %xmm5, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 236(%esp) + movdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last + aesenc %xmm5, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last: + 
aesenclast %xmm5, %xmm0 + subl $32, %esp + xorl %ecx, %ecx + movdqu %xmm0, (%esp) + pxor %xmm4, %xmm4 + movdqu %xmm4, 16(%esp) +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop: + movzbl (%esi,%ebx,1), %eax + movb %al, 16(%esp,%ecx,1) + xorb (%esp,%ecx,1), %al + movb %al, (%edi,%ebx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop + movdqu 16(%esp), %xmm0 + addl $32, %esp + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm2, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm2, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_aesni_done_dec: + movl 212(%esp), %esi + movl 228(%esp), %ebp + movl 216(%esp), %edx + movl 220(%esp), %ecx + shll $3, %edx + shll $3, %ecx + pinsrd $0x00, %edx, %xmm4 + pinsrd $2, %ecx, %xmm4 + movl 216(%esp), %edx + movl 220(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + pinsrd $0x01, %edx, %xmm4 + pinsrd $3, %ecx, %xmm4 + pxor %xmm4, %xmm2 + pshufd $0x4e, %xmm1, %xmm5 + pshufd $0x4e, %xmm2, %xmm6 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + pclmulqdq $0x11, %xmm1, %xmm7 + pclmulqdq $0x00, %xmm1, %xmm4 + pxor %xmm1, %xmm5 + pxor %xmm2, %xmm6 + pclmulqdq $0x00, %xmm6, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm6 + movdqa %xmm7, %xmm2 + pslldq $8, %xmm6 + psrldq $8, %xmm5 + pxor %xmm6, %xmm4 + pxor %xmm5, %xmm2 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + movdqa %xmm4, %xmm7 + pslld $31, %xmm5 + pslld $30, %xmm6 + pslld $25, %xmm7 + pxor %xmm6, %xmm5 + pxor %xmm7, %xmm5 + movdqa %xmm5, %xmm7 + psrldq $4, %xmm7 + pslldq $12, %xmm5 + pxor %xmm5, %xmm4 + movdqa %xmm4, %xmm5 + movdqa %xmm4, %xmm6 + psrld $0x01, %xmm5 + psrld $2, %xmm6 + pxor %xmm6, %xmm5 + pxor %xmm4, %xmm5 + psrld $7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 + pshufb L_aes_gcm_bswap_mask, %xmm2 + movdqu 80(%esp), %xmm4 + pxor %xmm2, %xmm4 + movl 240(%esp), %edi + cmpl $16, %ebp + je L_AES_GCM_decrypt_aesni_cmp_tag_16 + subl $16, %esp + xorl %ecx, %ecx + xorl %ebx, %ebx + movdqu %xmm4, (%esp) +L_AES_GCM_decrypt_aesni_cmp_tag_loop: + movzbl (%esp,%ecx,1), %eax + xorb (%esi,%ecx,1), %al + orb %al, %bl + incl %ecx + cmpl %ebp, %ecx + jne L_AES_GCM_decrypt_aesni_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addl $16, %esp + xorl %ecx, %ecx + jmp L_AES_GCM_decrypt_aesni_cmp_tag_done +L_AES_GCM_decrypt_aesni_cmp_tag_16: + movdqu (%esi), %xmm5 + pcmpeqb %xmm5, %xmm4 + pmovmskb %xmm4, %edx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_aesni_cmp_tag_done: + movl %ebx, (%edi) + addl $0xb0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_aesni,.-AES_GCM_decrypt_aesni +#ifdef WOLFSSL_AESGCM_STREAM +.text 
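+# Streaming GCM entry points (WOLFSSL_AESGCM_STREAM). Descriptive summary of the
+# routines that follow, based on the code below:
+#   AES_GCM_init_aesni         - computes the hash key H = E_K(0^128); builds the
+#                                initial counter block from the IV (12-byte fast
+#                                path, or GHASH over an arbitrary-length IV) and
+#                                encrypts it for use when finalising the tag
+#   AES_GCM_aad_update_aesni   - folds 16-byte blocks of AAD into the GHASH state
+#   AES_GCM_encrypt_block_aesni / AES_GCM_ghash_block_aesni
+#                              - single 16-byte block helpers
+#   AES_GCM_encrypt_update_aesni / AES_GCM_decrypt_update_aesni
+#                              - bulk data path: 64 bytes (4 blocks) per main-loop
+#                                iteration using precomputed H^1..H^4
+#   AES_GCM_encrypt_final_aesni / AES_GCM_decrypt_final_aesni
+#                              - GHASH the AAD/data bit lengths, then write or
+#                                compare the authentication tag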
+.globl AES_GCM_init_aesni +.type AES_GCM_init_aesni,@function +.align 16 +AES_GCM_init_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 36(%esp), %ebp + movl 44(%esp), %esi + movl 60(%esp), %edi + pxor %xmm4, %xmm4 + movl 48(%esp), %edx + cmpl $12, %edx + jne L_AES_GCM_init_aesni_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + pinsrd $0x00, (%esi), %xmm4 + pinsrd $0x01, 4(%esi), %xmm4 + pinsrd $2, 8(%esi), %xmm4 + pinsrd $3, %ecx, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + movdqa %xmm4, %xmm1 + movdqa (%ebp), %xmm5 + pxor %xmm5, %xmm1 + movdqa 16(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 32(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 48(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 64(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 80(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 96(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 112(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 128(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 144(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + cmpl $11, 40(%esp) + movdqa 160(%ebp), %xmm7 + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 176(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + cmpl $13, 40(%esp) + movdqa 192(%ebp), %xmm7 + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 208(%ebp), %xmm7 + aesenc %xmm7, %xmm5 + aesenc %xmm7, %xmm1 + movdqa 224(%ebp), %xmm7 +L_AES_GCM_init_aesni_calc_iv_12_last: + aesenclast %xmm7, %xmm5 + aesenclast %xmm7, %xmm1 + pshufb L_aes_gcm_bswap_mask, %xmm5 + movdqu %xmm1, (%edi) + jmp L_AES_GCM_init_aesni_iv_done +L_AES_GCM_init_aesni_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + movdqa (%ebp), %xmm5 + aesenc 16(%ebp), %xmm5 + aesenc 32(%ebp), %xmm5 + aesenc 48(%ebp), %xmm5 + aesenc 64(%ebp), %xmm5 + aesenc 80(%ebp), %xmm5 + aesenc 96(%ebp), %xmm5 + aesenc 112(%ebp), %xmm5 + aesenc 128(%ebp), %xmm5 + aesenc 144(%ebp), %xmm5 + cmpl $11, 40(%esp) + movdqa 160(%ebp), %xmm1 + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm1, %xmm5 + aesenc 176(%ebp), %xmm5 + cmpl $13, 40(%esp) + movdqa 192(%ebp), %xmm1 + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc %xmm1, %xmm5 + aesenc 208(%ebp), %xmm5 + movdqa 224(%ebp), %xmm1 +L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: + aesenclast %xmm1, %xmm5 + pshufb L_aes_gcm_bswap_mask, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_init_aesni_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_aesni_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_init_aesni_calc_iv_16_loop: + movdqu (%esi,%ecx,1), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm7 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm4, %xmm1 + psrld $31, %xmm0 + psrld $31, %xmm1 + pslld $0x01, %xmm7 + pslld $0x01, %xmm4 + movdqa 
%xmm0, %xmm2 + pslldq $4, %xmm0 + psrldq $12, %xmm2 + pslldq $4, %xmm1 + por %xmm2, %xmm4 + por %xmm0, %xmm7 + por %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm7 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm7, %xmm2 + pxor %xmm2, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_aesni_calc_iv_16_loop + movl 48(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_init_aesni_calc_iv_done +L_AES_GCM_init_aesni_calc_iv_lt16: + subl $16, %esp + pxor %xmm0, %xmm0 + xorl %ebx, %ebx + movdqu %xmm0, (%esp) +L_AES_GCM_init_aesni_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_init_aesni_calc_iv_loop + movdqu (%esp), %xmm0 + addl $16, %esp + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm7 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm4, %xmm1 + psrld $31, %xmm0 + psrld $31, %xmm1 + pslld $0x01, %xmm7 + pslld $0x01, %xmm4 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm0 + psrldq $12, %xmm2 + pslldq $4, %xmm1 + por %xmm2, %xmm4 + por %xmm0, %xmm7 + por %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm7 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm7, %xmm2 + pxor %xmm2, %xmm4 +L_AES_GCM_init_aesni_calc_iv_done: + # T = Encrypt counter + pxor %xmm0, %xmm0 + shll $3, %edx + pinsrd $0x00, %edx, %xmm0 + pxor %xmm0, %xmm4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm7 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm4, %xmm1 + psrld $31, %xmm0 + psrld $31, %xmm1 + pslld $0x01, %xmm7 + pslld $0x01, %xmm4 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm0 + psrldq $12, %xmm2 + pslldq $4, %xmm1 + por %xmm2, %xmm4 + por %xmm0, %xmm7 + por %xmm1, %xmm4 + movdqa %xmm7, %xmm0 + movdqa %xmm7, %xmm1 + movdqa %xmm7, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm7 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm7, 
%xmm2 + pxor %xmm2, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + # Encrypt counter + movdqa (%ebp), %xmm0 + pxor %xmm4, %xmm0 + aesenc 16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + aesenc 96(%ebp), %xmm0 + aesenc 112(%ebp), %xmm0 + aesenc 128(%ebp), %xmm0 + aesenc 144(%ebp), %xmm0 + cmpl $11, 40(%esp) + movdqa 160(%ebp), %xmm1 + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 40(%esp) + movdqa 192(%ebp), %xmm1 + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm1 +L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: + aesenclast %xmm1, %xmm0 + movdqu %xmm0, (%edi) +L_AES_GCM_init_aesni_iv_done: + movl 52(%esp), %ebp + movl 56(%esp), %edi + pshufb L_aes_gcm_bswap_epi64, %xmm4 + paddd L_aes_gcm_one, %xmm4 + movdqa %xmm5, (%ebp) + movdqa %xmm4, (%edi) + addl $16, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_init_aesni,.-AES_GCM_init_aesni +.text +.globl AES_GCM_aad_update_aesni +.type AES_GCM_aad_update_aesni,@function +.align 16 +AES_GCM_aad_update_aesni: + pushl %esi + pushl %edi + movl 12(%esp), %esi + movl 16(%esp), %edx + movl 20(%esp), %edi + movl 24(%esp), %eax + movdqa (%edi), %xmm5 + movdqa (%eax), %xmm6 + xorl %ecx, %ecx +L_AES_GCM_aad_update_aesni_16_loop: + movdqu (%esi,%ecx,1), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm5 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm5 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm1, %xmm5 + movdqa %xmm4, %xmm0 + movdqa %xmm5, %xmm1 + psrld $31, %xmm0 + psrld $31, %xmm1 + pslld $0x01, %xmm4 + pslld $0x01, %xmm5 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm0 + psrldq $12, %xmm2 + pslldq $4, %xmm1 + por %xmm2, %xmm5 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + movdqa %xmm4, %xmm0 + movdqa %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm4 + movdqa %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm4, %xmm2 + pxor %xmm2, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_aad_update_aesni_16_loop + movdqa %xmm5, (%edi) + popl %edi + popl %esi + ret +.size AES_GCM_aad_update_aesni,.-AES_GCM_aad_update_aesni +.text +.globl AES_GCM_encrypt_block_aesni +.type AES_GCM_encrypt_block_aesni,@function +.align 16 +AES_GCM_encrypt_block_aesni: + pushl %esi + pushl %edi + movl 12(%esp), %ecx + movl 16(%esp), %eax + movl 20(%esp), %edi + movl 24(%esp), %esi + movl 28(%esp), %edx + movdqu (%edx), %xmm0 + movdqa %xmm0, %xmm1 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pxor (%ecx), %xmm0 + movdqu %xmm1, (%edx) + aesenc 16(%ecx), %xmm0 + aesenc 32(%ecx), %xmm0 + aesenc 48(%ecx), %xmm0 + aesenc 64(%ecx), %xmm0 + aesenc 80(%ecx), %xmm0 + aesenc 96(%ecx), %xmm0 + aesenc 112(%ecx), %xmm0 + aesenc 128(%ecx), %xmm0 + aesenc 144(%ecx), %xmm0 + cmpl $11, %eax + movdqa 160(%ecx), %xmm1 + jl 
L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 176(%ecx), %xmm0 + cmpl $13, %eax + movdqa 192(%ecx), %xmm1 + jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 208(%ecx), %xmm0 + movdqa 224(%ecx), %xmm1 +L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: + aesenclast %xmm1, %xmm0 + movdqu (%esi), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%edi) + pshufb L_aes_gcm_bswap_mask, %xmm0 + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_block_aesni,.-AES_GCM_encrypt_block_aesni +.text +.globl AES_GCM_ghash_block_aesni +.type AES_GCM_ghash_block_aesni,@function +.align 16 +AES_GCM_ghash_block_aesni: + movl 4(%esp), %edx + movl 8(%esp), %eax + movl 12(%esp), %ecx + movdqa (%eax), %xmm4 + movdqa (%ecx), %xmm5 + movdqu (%edx), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm6 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm6 + pxor %xmm1, %xmm4 + movdqa %xmm6, %xmm0 + movdqa %xmm4, %xmm1 + psrld $31, %xmm0 + psrld $31, %xmm1 + pslld $0x01, %xmm6 + pslld $0x01, %xmm4 + movdqa %xmm0, %xmm2 + pslldq $4, %xmm0 + psrldq $12, %xmm2 + pslldq $4, %xmm1 + por %xmm2, %xmm4 + por %xmm0, %xmm6 + por %xmm1, %xmm4 + movdqa %xmm6, %xmm0 + movdqa %xmm6, %xmm1 + movdqa %xmm6, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm6 + movdqa %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm6, %xmm2 + pxor %xmm2, %xmm4 + movdqa %xmm4, (%eax) + ret +.size AES_GCM_ghash_block_aesni,.-AES_GCM_ghash_block_aesni +.text +.globl AES_GCM_encrypt_update_aesni +.type AES_GCM_encrypt_update_aesni,@function +.align 16 +AES_GCM_encrypt_update_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x60, %esp + movl 144(%esp), %esi + movdqa (%esi), %xmm4 + movdqu %xmm4, 64(%esp) + movl 136(%esp), %esi + movl 140(%esp), %ebp + movdqa (%esi), %xmm6 + movdqa (%ebp), %xmm5 + movdqu %xmm6, 80(%esp) + movl 116(%esp), %ebp + movl 124(%esp), %edi + movl 128(%esp), %esi + movdqa %xmm5, %xmm1 + movdqa %xmm5, %xmm0 + psrlq $63, %xmm1 + psllq $0x01, %xmm0 + pslldq $8, %xmm1 + por %xmm1, %xmm0 + pshufd $0xff, %xmm5, %xmm5 + psrad $31, %xmm5 + pand L_aes_gcm_mod2_128, %xmm5 + pxor %xmm0, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 132(%esp) + movl 132(%esp), %eax + jl L_AES_GCM_encrypt_update_aesni_done_64 + andl $0xffffffc0, %eax + movdqa %xmm6, %xmm2 + # H ^ 1 + movdqu %xmm5, (%esp) + # H ^ 2 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + 
pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm4 + movdqu %xmm4, 16(%esp) + # H ^ 3 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm7 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm7 + movdqu %xmm7, 32(%esp) + # H ^ 4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm7 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm7 + movdqu %xmm7, 48(%esp) + # First 64 bytes of input + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm0 + movdqa L_aes_gcm_bswap_epi64, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pshufb %xmm7, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pshufb %xmm7, %xmm1 + paddd L_aes_gcm_two, %xmm2 + pshufb %xmm7, %xmm2 + paddd L_aes_gcm_three, %xmm3 + pshufb %xmm7, %xmm3 + movdqu 64(%esp), %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) + movdqa (%ebp), %xmm7 + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa 16(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 32(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 48(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 64(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 80(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 96(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 112(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 128(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 
144(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $11, 120(%esp) + movdqa 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 176(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $13, 120(%esp) + movdqa 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 208(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 224(%ebp), %xmm7 +L_AES_GCM_encrypt_update_aesni_enc_done: + aesenclast %xmm7, %xmm0 + aesenclast %xmm7, %xmm1 + movdqu (%esi), %xmm4 + movdqu 16(%esi), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqu %xmm0, (%edi) + movdqu %xmm1, 16(%edi) + aesenclast %xmm7, %xmm2 + aesenclast %xmm7, %xmm3 + movdqu 32(%esi), %xmm4 + movdqu 48(%esi), %xmm5 + pxor %xmm4, %xmm2 + pxor %xmm5, %xmm3 + movdqu %xmm2, 32(%edi) + movdqu %xmm3, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + jle L_AES_GCM_encrypt_update_aesni_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_update_aesni_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm0 + movdqa L_aes_gcm_bswap_epi64, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pshufb %xmm7, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pshufb %xmm7, %xmm1 + paddd L_aes_gcm_two, %xmm2 + pshufb %xmm7, %xmm2 + paddd L_aes_gcm_three, %xmm3 + pshufb %xmm7, %xmm3 + movdqu 64(%esp), %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) + movdqa (%ebp), %xmm7 + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa 16(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 32(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 48(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 64(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 80(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 96(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 112(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 128(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 144(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $11, 120(%esp) + movdqa 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 176(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $13, 120(%esp) + movdqa 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 208(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 224(%ebp), %xmm7 
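+        # Note: the label below is also the branch target taken above for the
+        # shorter key schedules (fewer AES rounds). The code that follows applies
+        # the final AES round to the four counter blocks, XORs them with 64 bytes
+        # of plaintext and stores the ciphertext, then GHASHes the previous
+        # iteration's 64 bytes of ciphertext (at -64..-16 from the current output
+        # pointer) against H^4..H^1, finishing with the shift-based reduction
+        # (pslld $31/$30/$25, psrld $1/$2/$7) modulo the GHASH polynomial
+        # x^128 + x^7 + x^2 + x + 1; authentication of a 64-byte chunk therefore
+        # happens one loop iteration after that chunk is produced.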
+L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done: + aesenclast %xmm7, %xmm0 + aesenclast %xmm7, %xmm1 + movdqu (%ecx), %xmm4 + movdqu 16(%ecx), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + aesenclast %xmm7, %xmm2 + aesenclast %xmm7, %xmm3 + movdqu 32(%ecx), %xmm4 + movdqu 48(%ecx), %xmm5 + pxor %xmm4, %xmm2 + pxor %xmm5, %xmm3 + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + # ghash encrypted counter + movdqu 80(%esp), %xmm2 + movdqu 48(%esp), %xmm7 + movdqu -64(%edx), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0x4e, %xmm7, %xmm1 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm7, %xmm1 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm3 + pclmulqdq $0x11, %xmm7, %xmm3 + movdqa %xmm0, %xmm2 + pclmulqdq $0x00, %xmm7, %xmm2 + pclmulqdq $0x00, %xmm5, %xmm1 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqu 32(%esp), %xmm7 + movdqu -48(%edx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu 16(%esp), %xmm7 + movdqu -32(%edx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu (%esp), %xmm7 + movdqu -16(%edx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm5 + psrldq $8, %xmm1 + pslldq $8, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm1, %xmm3 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm5 + pslld $31, %xmm7 + pslld $30, %xmm4 + pslld $25, %xmm5 + pxor %xmm4, %xmm7 + pxor %xmm5, %xmm7 + movdqa %xmm7, %xmm4 + pslldq $12, %xmm7 + psrldq $4, %xmm4 + pxor %xmm7, %xmm2 + movdqa %xmm2, %xmm5 + movdqa %xmm2, %xmm1 + movdqa %xmm2, %xmm0 + psrld $0x01, %xmm5 + psrld $2, %xmm1 + psrld $7, %xmm0 + pxor %xmm1, %xmm5 + pxor %xmm0, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm3, %xmm2 + movdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_aesni_ghash_64 +L_AES_GCM_encrypt_update_aesni_end_64: + movdqu 80(%esp), %xmm6 + # Block 1 + movdqa L_aes_gcm_bswap_mask, %xmm0 + movdqu (%edx), %xmm5 + pshufb %xmm0, %xmm5 + movdqu 48(%esp), %xmm7 + pxor %xmm6, %xmm5 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm7, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm1, %xmm6 + # Block 2 + movdqa L_aes_gcm_bswap_mask, %xmm0 + movdqu 16(%edx), %xmm5 + pshufb %xmm0, %xmm5 + movdqu 32(%esp), %xmm7 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, 
%xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm7, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + pxor %xmm0, %xmm4 + pxor %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm1, %xmm6 + # Block 3 + movdqa L_aes_gcm_bswap_mask, %xmm0 + movdqu 32(%edx), %xmm5 + pshufb %xmm0, %xmm5 + movdqu 16(%esp), %xmm7 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm7, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + pxor %xmm0, %xmm4 + pxor %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm1, %xmm6 + # Block 4 + movdqa L_aes_gcm_bswap_mask, %xmm0 + movdqu 48(%edx), %xmm5 + pshufb %xmm0, %xmm5 + movdqu (%esp), %xmm7 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm7, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + pxor %xmm0, %xmm4 + pxor %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm4 + pxor %xmm1, %xmm6 + movdqa %xmm4, %xmm0 + movdqa %xmm4, %xmm1 + movdqa %xmm4, %xmm2 + pslld $31, %xmm0 + pslld $30, %xmm1 + pslld $25, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, %xmm1 + psrldq $4, %xmm1 + pslldq $12, %xmm0 + pxor %xmm0, %xmm4 + movdqa %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + psrld $0x01, %xmm2 + psrld $2, %xmm3 + psrld $7, %xmm0 + pxor %xmm3, %xmm2 + pxor %xmm0, %xmm2 + pxor %xmm1, %xmm2 + pxor %xmm4, %xmm2 + pxor %xmm2, %xmm6 + movdqu (%esp), %xmm5 +L_AES_GCM_encrypt_update_aesni_done_64: + movl 132(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_update_aesni_done_enc + movl 132(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_aesni_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu 64(%esp), %xmm0 + movdqa %xmm0, %xmm1 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pxor (%ebp), %xmm0 + movdqu %xmm1, 64(%esp) + aesenc 16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + aesenc 96(%ebp), %xmm0 + aesenc 112(%ebp), %xmm0 + aesenc 128(%ebp), %xmm0 + aesenc 144(%ebp), %xmm0 + cmpl $11, 120(%esp) + movdqa 160(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 120(%esp) + movdqa 192(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc %xmm1, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm1 +L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: + aesenclast %xmm1, %xmm0 + movdqu (%ecx), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%edx) + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm6 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_aesni_last_block_ghash +L_AES_GCM_encrypt_update_aesni_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu 64(%esp), %xmm0 + movdqa %xmm0, %xmm1 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pxor (%ebp), %xmm0 + movdqu %xmm1, 64(%esp) + movdqu 
%xmm6, %xmm4 + pclmulqdq $16, %xmm5, %xmm4 + aesenc 16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + movdqu %xmm6, %xmm7 + pclmulqdq $0x01, %xmm5, %xmm7 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + movdqu %xmm6, %xmm1 + pclmulqdq $0x11, %xmm5, %xmm1 + aesenc 96(%ebp), %xmm0 + pxor %xmm7, %xmm4 + movdqa %xmm4, %xmm2 + psrldq $8, %xmm4 + pslldq $8, %xmm2 + aesenc 112(%ebp), %xmm0 + movdqu %xmm6, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm7 + pxor %xmm7, %xmm2 + pxor %xmm4, %xmm1 + movdqa L_aes_gcm_mod2_128, %xmm3 + movdqa %xmm2, %xmm7 + pclmulqdq $16, %xmm3, %xmm7 + aesenc 128(%ebp), %xmm0 + pshufd $0x4e, %xmm2, %xmm4 + pxor %xmm7, %xmm4 + movdqa %xmm4, %xmm7 + pclmulqdq $16, %xmm3, %xmm7 + aesenc 144(%ebp), %xmm0 + pshufd $0x4e, %xmm4, %xmm6 + pxor %xmm7, %xmm6 + pxor %xmm1, %xmm6 + cmpl $11, 120(%esp) + movdqa 160(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last + aesenc %xmm1, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 120(%esp) + movdqa 192(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last + aesenc %xmm1, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm1 +L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last: + aesenclast %xmm1, %xmm0 + movdqu (%ecx), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%edx) + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm0, %xmm6 + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_aesni_last_block_start +L_AES_GCM_encrypt_update_aesni_last_block_ghash: + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm6 +L_AES_GCM_encrypt_update_aesni_last_block_done: +L_AES_GCM_encrypt_update_aesni_done_enc: + movl 136(%esp), %esi + movl 144(%esp), %edi + movdqu 64(%esp), %xmm4 + movdqa %xmm6, (%esi) + movdqu %xmm4, (%edi) + addl $0x60, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_update_aesni,.-AES_GCM_encrypt_update_aesni +.text +.globl AES_GCM_encrypt_final_aesni +.type AES_GCM_encrypt_final_aesni,@function +.align 16 +AES_GCM_encrypt_final_aesni: + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 32(%esp), %ebp + movl 52(%esp), %esi + movl 56(%esp), %edi + movdqa (%ebp), %xmm4 + movdqa (%esi), %xmm5 + movdqa (%edi), %xmm6 + movdqa %xmm5, %xmm1 + movdqa %xmm5, %xmm0 + psrlq $63, %xmm1 + psllq $0x01, %xmm0 + pslldq $8, %xmm1 + por %xmm1, %xmm0 + pshufd $0xff, %xmm5, %xmm5 + psrad $31, %xmm5 + pand L_aes_gcm_mod2_128, %xmm5 + pxor %xmm0, %xmm5 + movl 44(%esp), %edx + movl 48(%esp), %ecx + shll $3, %edx + shll $3, %ecx + pinsrd $0x00, %edx, %xmm0 + pinsrd $2, %ecx, %xmm0 + movl 44(%esp), %edx + movl 48(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + pinsrd $0x01, %edx, %xmm0 + pinsrd $3, %ecx, %xmm0 + pxor %xmm0, %xmm4 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm4, %xmm2 + 
movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm4 + movdqu %xmm6, %xmm0 + pxor %xmm4, %xmm0 + movl 36(%esp), %edi + cmpl $16, 40(%esp) + je L_AES_GCM_encrypt_final_aesni_store_tag_16 + xorl %ecx, %ecx + movdqu %xmm0, (%esp) +L_AES_GCM_encrypt_final_aesni_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl 40(%esp), %ecx + jne L_AES_GCM_encrypt_final_aesni_store_tag_loop + jmp L_AES_GCM_encrypt_final_aesni_store_tag_done +L_AES_GCM_encrypt_final_aesni_store_tag_16: + movdqu %xmm0, (%edi) +L_AES_GCM_encrypt_final_aesni_store_tag_done: + addl $16, %esp + popl %ebp + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_final_aesni,.-AES_GCM_encrypt_final_aesni +.text +.globl AES_GCM_decrypt_update_aesni +.type AES_GCM_decrypt_update_aesni,@function +.align 16 +AES_GCM_decrypt_update_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xa0, %esp + movl 208(%esp), %esi + movdqa (%esi), %xmm4 + movdqu %xmm4, 64(%esp) + movl 200(%esp), %esi + movl 204(%esp), %ebp + movdqa (%esi), %xmm6 + movdqa (%ebp), %xmm5 + movdqu %xmm6, 80(%esp) + movl 180(%esp), %ebp + movl 188(%esp), %edi + movl 192(%esp), %esi + movdqa %xmm5, %xmm1 + movdqa %xmm5, %xmm0 + psrlq $63, %xmm1 + psllq $0x01, %xmm0 + pslldq $8, %xmm1 + por %xmm1, %xmm0 + pshufd $0xff, %xmm5, %xmm5 + psrad $31, %xmm5 + pand L_aes_gcm_mod2_128, %xmm5 + pxor %xmm0, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 196(%esp) + movl 196(%esp), %eax + jl L_AES_GCM_decrypt_update_aesni_done_64 + andl $0xffffffc0, %eax + movdqa %xmm6, %xmm2 + # H ^ 1 + movdqu %xmm5, (%esp) + # H ^ 2 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm5, %xmm2 + movdqa %xmm5, %xmm3 + movdqa %xmm5, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm4 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm4 + movdqu %xmm4, 16(%esp) + # H ^ 3 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, 
%xmm7 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm7 + movdqu %xmm7, 32(%esp) + # H ^ 4 + pshufd $0x4e, %xmm4, %xmm1 + pshufd $0x4e, %xmm4, %xmm2 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm0 + pclmulqdq $0x11, %xmm4, %xmm3 + pclmulqdq $0x00, %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm7 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm7 + movdqu %xmm7, 48(%esp) + cmpl %esi, %edi + jne L_AES_GCM_decrypt_update_aesni_ghash_64 +L_AES_GCM_decrypt_update_aesni_ghash_64_inplace: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm0 + movdqa L_aes_gcm_bswap_epi64, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pshufb %xmm7, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pshufb %xmm7, %xmm1 + paddd L_aes_gcm_two, %xmm2 + pshufb %xmm7, %xmm2 + paddd L_aes_gcm_three, %xmm3 + pshufb %xmm7, %xmm3 + movdqu 64(%esp), %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) + movdqa (%ebp), %xmm7 + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa 16(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 32(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 48(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 64(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 80(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 96(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 112(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 128(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 144(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $11, 184(%esp) + movdqa 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 176(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $13, 184(%esp) + movdqa 192(%ebp), %xmm7 + jl 
L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 208(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done: + aesenclast %xmm7, %xmm0 + aesenclast %xmm7, %xmm1 + movdqu (%ecx), %xmm4 + movdqu 16(%ecx), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqu %xmm4, 96(%esp) + movdqu %xmm5, 112(%esp) + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + aesenclast %xmm7, %xmm2 + aesenclast %xmm7, %xmm3 + movdqu 32(%ecx), %xmm4 + movdqu 48(%ecx), %xmm5 + pxor %xmm4, %xmm2 + pxor %xmm5, %xmm3 + movdqu %xmm4, 128(%esp) + movdqu %xmm5, 144(%esp) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + # ghash encrypted counter + movdqu 80(%esp), %xmm2 + movdqu 48(%esp), %xmm7 + movdqu 96(%esp), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0x4e, %xmm7, %xmm1 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm7, %xmm1 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm3 + pclmulqdq $0x11, %xmm7, %xmm3 + movdqa %xmm0, %xmm2 + pclmulqdq $0x00, %xmm7, %xmm2 + pclmulqdq $0x00, %xmm5, %xmm1 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqu 32(%esp), %xmm7 + movdqu 112(%esp), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu 16(%esp), %xmm7 + movdqu 128(%esp), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu (%esp), %xmm7 + movdqu 144(%esp), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm5 + psrldq $8, %xmm1 + pslldq $8, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm1, %xmm3 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm5 + pslld $31, %xmm7 + pslld $30, %xmm4 + pslld $25, %xmm5 + pxor %xmm4, %xmm7 + pxor %xmm5, %xmm7 + movdqa %xmm7, %xmm4 + pslldq $12, %xmm7 + psrldq $4, %xmm4 + pxor %xmm7, %xmm2 + movdqa %xmm2, %xmm5 + movdqa %xmm2, %xmm1 + movdqa %xmm2, %xmm0 + psrld $0x01, %xmm5 + psrld $2, %xmm1 + psrld $7, %xmm0 + pxor %xmm1, %xmm5 + pxor %xmm0, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm3, %xmm2 + movdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_aesni_ghash_64_inplace + jmp L_AES_GCM_decrypt_update_aesni_ghash_64_done +L_AES_GCM_decrypt_update_aesni_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # Encrypt 64 bytes of counter + movdqu 64(%esp), %xmm0 + movdqa L_aes_gcm_bswap_epi64, %xmm7 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pshufb %xmm7, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pshufb %xmm7, %xmm1 + 
paddd L_aes_gcm_two, %xmm2 + pshufb %xmm7, %xmm2 + paddd L_aes_gcm_three, %xmm3 + pshufb %xmm7, %xmm3 + movdqu 64(%esp), %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) + movdqa (%ebp), %xmm7 + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa 16(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 32(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 48(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 64(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 80(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 96(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 112(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 128(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 144(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $11, 184(%esp) + movdqa 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 176(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + cmpl $13, 184(%esp) + movdqa 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 208(%ebp), %xmm7 + aesenc %xmm7, %xmm0 + aesenc %xmm7, %xmm1 + aesenc %xmm7, %xmm2 + aesenc %xmm7, %xmm3 + movdqa 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done: + aesenclast %xmm7, %xmm0 + aesenclast %xmm7, %xmm1 + movdqu (%ecx), %xmm4 + movdqu 16(%ecx), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqu %xmm4, (%ecx) + movdqu %xmm5, 16(%ecx) + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + aesenclast %xmm7, %xmm2 + aesenclast %xmm7, %xmm3 + movdqu 32(%ecx), %xmm4 + movdqu 48(%ecx), %xmm5 + pxor %xmm4, %xmm2 + pxor %xmm5, %xmm3 + movdqu %xmm4, 32(%ecx) + movdqu %xmm5, 48(%ecx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + # ghash encrypted counter + movdqu 80(%esp), %xmm2 + movdqu 48(%esp), %xmm7 + movdqu (%ecx), %xmm0 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm2, %xmm0 + pshufd $0x4e, %xmm7, %xmm1 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm7, %xmm1 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm3 + pclmulqdq $0x11, %xmm7, %xmm3 + movdqa %xmm0, %xmm2 + pclmulqdq $0x00, %xmm7, %xmm2 + pclmulqdq $0x00, %xmm5, %xmm1 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqu 32(%esp), %xmm7 + movdqu 16(%ecx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu 16(%esp), %xmm7 + movdqu 32(%ecx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, 
%xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqu (%esp), %xmm7 + movdqu 48(%ecx), %xmm0 + pshufd $0x4e, %xmm7, %xmm4 + pshufb L_aes_gcm_bswap_mask, %xmm0 + pxor %xmm7, %xmm4 + pshufd $0x4e, %xmm0, %xmm5 + pxor %xmm0, %xmm5 + movdqa %xmm0, %xmm6 + pclmulqdq $0x11, %xmm7, %xmm6 + pclmulqdq $0x00, %xmm0, %xmm7 + pclmulqdq $0x00, %xmm5, %xmm4 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm6, %xmm1 + pxor %xmm6, %xmm3 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm5 + psrldq $8, %xmm1 + pslldq $8, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm1, %xmm3 + movdqa %xmm2, %xmm7 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm5 + pslld $31, %xmm7 + pslld $30, %xmm4 + pslld $25, %xmm5 + pxor %xmm4, %xmm7 + pxor %xmm5, %xmm7 + movdqa %xmm7, %xmm4 + pslldq $12, %xmm7 + psrldq $4, %xmm4 + pxor %xmm7, %xmm2 + movdqa %xmm2, %xmm5 + movdqa %xmm2, %xmm1 + movdqa %xmm2, %xmm0 + psrld $0x01, %xmm5 + psrld $2, %xmm1 + psrld $7, %xmm0 + pxor %xmm1, %xmm5 + pxor %xmm0, %xmm5 + pxor %xmm4, %xmm5 + pxor %xmm5, %xmm2 + pxor %xmm3, %xmm2 + movdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_aesni_ghash_64 +L_AES_GCM_decrypt_update_aesni_ghash_64_done: + movdqa %xmm2, %xmm6 + movdqu (%esp), %xmm5 +L_AES_GCM_decrypt_update_aesni_done_64: + movl 196(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_update_aesni_done_dec + movl 196(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_update_aesni_last_block_done +L_AES_GCM_decrypt_update_aesni_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + movdqu (%ecx), %xmm1 + pshufb L_aes_gcm_bswap_mask, %xmm1 + pxor %xmm6, %xmm1 + movdqu %xmm1, (%esp) + movdqu 64(%esp), %xmm0 + movdqa %xmm0, %xmm1 + pshufb L_aes_gcm_bswap_epi64, %xmm0 + paddd L_aes_gcm_one, %xmm1 + pxor (%ebp), %xmm0 + movdqu %xmm1, 64(%esp) + movdqu (%esp), %xmm4 + pclmulqdq $16, %xmm5, %xmm4 + aesenc 16(%ebp), %xmm0 + aesenc 32(%ebp), %xmm0 + movdqu (%esp), %xmm7 + pclmulqdq $0x01, %xmm5, %xmm7 + aesenc 48(%ebp), %xmm0 + aesenc 64(%ebp), %xmm0 + aesenc 80(%ebp), %xmm0 + movdqu (%esp), %xmm1 + pclmulqdq $0x11, %xmm5, %xmm1 + aesenc 96(%ebp), %xmm0 + pxor %xmm7, %xmm4 + movdqa %xmm4, %xmm2 + psrldq $8, %xmm4 + pslldq $8, %xmm2 + aesenc 112(%ebp), %xmm0 + movdqu (%esp), %xmm7 + pclmulqdq $0x00, %xmm5, %xmm7 + pxor %xmm7, %xmm2 + pxor %xmm4, %xmm1 + movdqa L_aes_gcm_mod2_128, %xmm3 + movdqa %xmm2, %xmm7 + pclmulqdq $16, %xmm3, %xmm7 + aesenc 128(%ebp), %xmm0 + pshufd $0x4e, %xmm2, %xmm4 + pxor %xmm7, %xmm4 + movdqa %xmm4, %xmm7 + pclmulqdq $16, %xmm3, %xmm7 + aesenc 144(%ebp), %xmm0 + pshufd $0x4e, %xmm4, %xmm6 + pxor %xmm7, %xmm6 + pxor %xmm1, %xmm6 + cmpl $11, 184(%esp) + movdqa 160(%ebp), %xmm1 + jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last + aesenc %xmm1, %xmm0 + aesenc 176(%ebp), %xmm0 + cmpl $13, 184(%esp) + movdqa 192(%ebp), %xmm1 + jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last + aesenc %xmm1, %xmm0 + aesenc 208(%ebp), %xmm0 + movdqa 224(%ebp), %xmm1 +L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last: + aesenclast %xmm1, %xmm0 + movdqu (%ecx), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%edx) + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_aesni_last_block_start +L_AES_GCM_decrypt_update_aesni_last_block_done: +L_AES_GCM_decrypt_update_aesni_done_dec: + movl 200(%esp), %esi + movl 208(%esp), %edi + movdqu 64(%esp), %xmm4 + movdqa %xmm6, (%esi) + movdqu %xmm4, (%edi) + addl 
$0xa0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_update_aesni,.-AES_GCM_decrypt_update_aesni +.text +.globl AES_GCM_decrypt_final_aesni +.type AES_GCM_decrypt_final_aesni,@function +.align 16 +AES_GCM_decrypt_final_aesni: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 36(%esp), %ebp + movl 56(%esp), %esi + movl 60(%esp), %edi + movdqa (%ebp), %xmm6 + movdqa (%esi), %xmm5 + movdqa (%edi), %xmm7 + movdqa %xmm5, %xmm1 + movdqa %xmm5, %xmm0 + psrlq $63, %xmm1 + psllq $0x01, %xmm0 + pslldq $8, %xmm1 + por %xmm1, %xmm0 + pshufd $0xff, %xmm5, %xmm5 + psrad $31, %xmm5 + pand L_aes_gcm_mod2_128, %xmm5 + pxor %xmm0, %xmm5 + movl 48(%esp), %edx + movl 52(%esp), %ecx + shll $3, %edx + shll $3, %ecx + pinsrd $0x00, %edx, %xmm0 + pinsrd $2, %ecx, %xmm0 + movl 48(%esp), %edx + movl 52(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + pinsrd $0x01, %edx, %xmm0 + pinsrd $3, %ecx, %xmm0 + pxor %xmm0, %xmm6 + pshufd $0x4e, %xmm5, %xmm1 + pshufd $0x4e, %xmm6, %xmm2 + movdqa %xmm6, %xmm3 + movdqa %xmm6, %xmm0 + pclmulqdq $0x11, %xmm5, %xmm3 + pclmulqdq $0x00, %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pclmulqdq $0x00, %xmm2, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm2 + movdqa %xmm3, %xmm6 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + movdqa %xmm0, %xmm3 + pslld $31, %xmm1 + pslld $30, %xmm2 + pslld $25, %xmm3 + pxor %xmm2, %xmm1 + pxor %xmm3, %xmm1 + movdqa %xmm1, %xmm3 + psrldq $4, %xmm3 + pslldq $12, %xmm1 + pxor %xmm1, %xmm0 + movdqa %xmm0, %xmm1 + movdqa %xmm0, %xmm2 + psrld $0x01, %xmm1 + psrld $2, %xmm2 + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm1 + psrld $7, %xmm0 + pxor %xmm3, %xmm1 + pxor %xmm0, %xmm1 + pxor %xmm1, %xmm6 + pshufb L_aes_gcm_bswap_mask, %xmm6 + movdqu %xmm7, %xmm0 + pxor %xmm6, %xmm0 + movl 40(%esp), %esi + movl 64(%esp), %edi + cmpl $16, 44(%esp) + je L_AES_GCM_decrypt_final_aesni_cmp_tag_16 + subl $16, %esp + xorl %ecx, %ecx + xorl %ebx, %ebx + movdqu %xmm0, (%esp) +L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: + movzbl (%esp,%ecx,1), %eax + xorb (%esi,%ecx,1), %al + orb %al, %bl + incl %ecx + cmpl 44(%esp), %ecx + jne L_AES_GCM_decrypt_final_aesni_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addl $16, %esp + xorl %ecx, %ecx + jmp L_AES_GCM_decrypt_final_aesni_cmp_tag_done +L_AES_GCM_decrypt_final_aesni_cmp_tag_16: + movdqu (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_final_aesni_cmp_tag_done: + movl %ebx, (%edi) + addl $16, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_final_aesni,.-AES_GCM_decrypt_final_aesni +#endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef HAVE_INTEL_AVX1 +.text +.globl AES_GCM_encrypt_avx1 +.type AES_GCM_encrypt_avx1,@function +.align 16 +AES_GCM_encrypt_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x70, %esp + movl 144(%esp), %esi + movl 168(%esp), %ebp + movl 160(%esp), %edx + vpxor %xmm0, %xmm0, %xmm0 + vpxor %xmm2, %xmm2, %xmm2 + cmpl $12, %edx + jne L_AES_GCM_encrypt_avx1_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vpinsrd $0x00, (%esi), %xmm0, %xmm0 + vpinsrd $0x01, 4(%esi), %xmm0, %xmm0 + vpinsrd $2, 8(%esi), %xmm0, %xmm0 + vpinsrd $3, %ecx, %xmm0, %xmm0 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%ebp), %xmm1 + vpxor %xmm1, 
%xmm0, %xmm5 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_avx1_calc_iv_12_last: + vaesenclast %xmm3, %xmm1, %xmm1 + vaesenclast %xmm3, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 + vmovdqu %xmm5, 80(%esp) + jmp L_AES_GCM_encrypt_avx1_iv_done +L_AES_GCM_encrypt_avx1_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%ebp), %xmm1 + vaesenc 16(%ebp), %xmm1, %xmm1 + vaesenc 32(%ebp), %xmm1, %xmm1 + vaesenc 48(%ebp), %xmm1, %xmm1 + vaesenc 64(%ebp), %xmm1, %xmm1 + vaesenc 80(%ebp), %xmm1, %xmm1 + vaesenc 96(%ebp), %xmm1, %xmm1 + vaesenc 112(%ebp), %xmm1, %xmm1 + vaesenc 128(%ebp), %xmm1, %xmm1 + vaesenc 144(%ebp), %xmm1, %xmm1 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm5, %xmm1, %xmm1 + vaesenc 176(%ebp), %xmm1, %xmm1 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm5, %xmm1, %xmm1 + vaesenc 208(%ebp), %xmm1, %xmm1 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm5, %xmm1, %xmm1 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_encrypt_avx1_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx1_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx1_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, 
%xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx1_calc_iv_16_loop + movl 160(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx1_calc_iv_done +L_AES_GCM_encrypt_avx1_calc_iv_lt16: + subl $16, %esp + vpxor %xmm4, %xmm4, %xmm4 + xorl %ebx, %ebx + vmovdqu %xmm4, (%esp) +L_AES_GCM_encrypt_avx1_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx1_calc_iv_loop + vmovdqu (%esp), %xmm4 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 +L_AES_GCM_encrypt_avx1_calc_iv_done: + # T = Encrypt counter + vpxor %xmm4, %xmm4, %xmm4 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, 
%xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + # Encrypt counter + vmovdqa (%ebp), %xmm4 + vpxor %xmm0, %xmm4, %xmm4 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vaesenc 96(%ebp), %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm4, %xmm4 + vaesenc 128(%ebp), %xmm4, %xmm4 + vaesenc 144(%ebp), %xmm4, %xmm4 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm5, %xmm4, %xmm4 + vmovdqu %xmm4, 80(%esp) +L_AES_GCM_encrypt_avx1_iv_done: + movl 140(%esp), %esi + # Additional authentication data + movl 156(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_avx1_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx1_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx1_calc_aad_16_loop: + vmovdqu (%esi,%ecx,1), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm2, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm4 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm2, %xmm2 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm2, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm2, %xmm2 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm2, %xmm2 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm2, %xmm2 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx1_calc_aad_16_loop + movl 156(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx1_calc_aad_done +L_AES_GCM_encrypt_avx1_calc_aad_lt16: + subl $16, %esp + vpxor %xmm4, %xmm4, %xmm4 + xorl %ebx, %ebx + vmovdqu %xmm4, (%esp) +L_AES_GCM_encrypt_avx1_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx1_calc_aad_loop + vmovdqu (%esp), %xmm4 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm2, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm4 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm2 + vpslldq $8, 
%xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm2, %xmm2 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm2, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm2, %xmm2 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm2, %xmm2 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm2, %xmm2 +L_AES_GCM_encrypt_avx1_calc_aad_done: + vmovdqu %xmm2, 96(%esp) + movl 132(%esp), %esi + movl 136(%esp), %edi + # Calculate counter and H + vpsrlq $63, %xmm1, %xmm5 + vpsllq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm5 + vpor %xmm5, %xmm4, %xmm4 + vpshufd $0xff, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm1 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0 + vpand L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, 64(%esp) + xorl %ebx, %ebx + cmpl $0x40, 152(%esp) + movl 152(%esp), %eax + jl L_AES_GCM_encrypt_avx1_done_64 + andl $0xffffffc0, %eax + vmovdqa %xmm2, %xmm6 + # H ^ 1 + vmovdqu %xmm1, (%esp) + # H ^ 2 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm0 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm0, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm0, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm3 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm3, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm3, 48(%esp) + # First 64 bytes 
of input + vmovdqu 64(%esp), %xmm4 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 + vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 + vpshufb %xmm3, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx1_two, %xmm4, %xmm6 + vpshufb %xmm3, %xmm6, %xmm6 + vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 + vpshufb %xmm3, %xmm7, %xmm7 + vpshufb %xmm3, %xmm4, %xmm4 + vmovdqu 64(%esp), %xmm3 + vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 + vmovdqu %xmm3, 64(%esp) + vmovdqa (%ebp), %xmm3 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_avx1_aesenc_64_enc_done: + vaesenclast %xmm3, %xmm4, %xmm4 + vaesenclast %xmm3, %xmm5, %xmm5 + vmovdqu (%esi), %xmm0 + vmovdqu 16(%esi), %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu %xmm0, (%esi) + vmovdqu %xmm1, 16(%esi) + vmovdqu %xmm4, (%edi) + vmovdqu %xmm5, 16(%edi) + vaesenclast %xmm3, %xmm6, %xmm6 + vaesenclast %xmm3, %xmm7, %xmm7 + vmovdqu 32(%esi), %xmm0 + vmovdqu 48(%esi), %xmm1 + vpxor %xmm0, %xmm6, %xmm6 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm0, 32(%esi) + vmovdqu %xmm1, 48(%esi) + vmovdqu %xmm6, 32(%edi) + vmovdqu %xmm7, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx + jle L_AES_GCM_encrypt_avx1_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_avx1_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm4 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 + vpaddd L_aes_gcm_avx1_one, 
%xmm4, %xmm5 + vpshufb %xmm3, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx1_two, %xmm4, %xmm6 + vpshufb %xmm3, %xmm6, %xmm6 + vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 + vpshufb %xmm3, %xmm7, %xmm7 + vpshufb %xmm3, %xmm4, %xmm4 + vmovdqu 64(%esp), %xmm3 + vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 + vmovdqu %xmm3, 64(%esp) + vmovdqa (%ebp), %xmm3 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm3, %xmm4, %xmm4 + vaesenclast %xmm3, %xmm5, %xmm5 + vmovdqu (%ecx), %xmm0 + vmovdqu 16(%ecx), %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu %xmm4, (%edx) + vmovdqu %xmm5, 16(%edx) + vaesenclast %xmm3, %xmm6, %xmm6 + vaesenclast %xmm3, %xmm7, %xmm7 + vmovdqu 32(%ecx), %xmm0 + vmovdqu 48(%ecx), %xmm1 + vpxor %xmm0, %xmm6, %xmm6 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm6, 32(%edx) + vmovdqu %xmm7, 48(%edx) + # ghash encrypted counter + vmovdqu 96(%esp), %xmm6 + vmovdqu 48(%esp), %xmm3 + vmovdqu -64(%edx), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpshufd $0x4e, %xmm3, %xmm5 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm7 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm6 + vpclmulqdq $0x00, %xmm1, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + 
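# GHASH the second ciphertext block of the previous 64-byte chunk with H^3 from 32(%esp)
+        # Karatsuba partial products accumulate in xmm7 (high), xmm6 (low) and xmm5 (middle) + 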
vmovdqu 32(%esp), %xmm3 + vmovdqu -48(%edx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu 16(%esp), %xmm3 + vmovdqu -32(%edx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu (%esp), %xmm3 + vmovdqu -16(%edx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpslld $31, %xmm6, %xmm3 + vpslld $30, %xmm6, %xmm0 + vpslld $25, %xmm6, %xmm1 + vpxor %xmm0, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpsrldq $4, %xmm3, %xmm0 + vpslldq $12, %xmm3, %xmm3 + vpxor %xmm3, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm1 + vpsrld $2, %xmm6, %xmm5 + vpsrld $7, %xmm6, %xmm4 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_avx1_ghash_64 +L_AES_GCM_encrypt_avx1_end_64: + vmovdqu 96(%esp), %xmm2 + # Block 1 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 + vmovdqa (%edx), %xmm1 + vpshufb %xmm4, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm3, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm0 + vmovdqa %xmm7, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm0, %xmm0 + vpxor %xmm5, %xmm2, %xmm2 + # Block 2 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 + vmovdqa 16(%edx), %xmm1 + vpshufb %xmm4, %xmm1, %xmm1 + vmovdqu 32(%esp), %xmm3 + # ghash_gfmul_xor_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm3, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm2, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm0, %xmm0 + vpxor %xmm5, %xmm2, %xmm2 + # Block 3 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 + vmovdqa 32(%edx), %xmm1 + vpshufb %xmm4, %xmm1, %xmm1 + vmovdqu 16(%esp), %xmm3 + # ghash_gfmul_xor_avx + 
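# Karatsuba multiply block 3 by H^2 from 16(%esp); fold the high/low products into the xmm2/xmm0 accumulators + 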
vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm3, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm2, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm0, %xmm0 + vpxor %xmm5, %xmm2, %xmm2 + # Block 4 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 + vmovdqa 48(%edx), %xmm1 + vpshufb %xmm4, %xmm1, %xmm1 + vmovdqu (%esp), %xmm3 + # ghash_gfmul_xor_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm3, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm3, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm3, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm2, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm0, %xmm0 + vpxor %xmm5, %xmm2, %xmm2 + vpslld $31, %xmm0, %xmm4 + vpslld $30, %xmm0, %xmm5 + vpslld $25, %xmm0, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm6 + vpsrld $2, %xmm0, %xmm7 + vpsrld $7, %xmm0, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm0, %xmm6, %xmm6 + vpxor %xmm6, %xmm2, %xmm2 + vmovdqu (%esp), %xmm1 +L_AES_GCM_encrypt_avx1_done_64: + movl 152(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_avx1_done_enc + movl 152(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_avx1_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm5 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4 + vpaddd L_aes_gcm_avx1_one, %xmm5, %xmm5 + vmovdqu %xmm5, 64(%esp) + vpxor (%ebp), %xmm4, %xmm4 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vaesenc 96(%ebp), %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm4, %xmm4 + vaesenc 128(%ebp), %xmm4, %xmm4 + vaesenc 144(%ebp), %xmm4, %xmm4 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last: + vaesenclast %xmm5, %xmm4, %xmm4 + vmovdqu (%ecx), %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vmovdqu %xmm4, (%edx) + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_avx1_last_block_ghash +L_AES_GCM_encrypt_avx1_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm5 + vmovdqu %xmm2, %xmm7 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4 + vpaddd L_aes_gcm_avx1_one, %xmm5, %xmm5 + vmovdqu %xmm5, 64(%esp) + vpxor (%ebp), %xmm4, %xmm4 + vpclmulqdq $16, %xmm1, %xmm7, %xmm0 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vpclmulqdq $0x01, %xmm1, %xmm7, %xmm3 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vpclmulqdq 
$0x11, %xmm1, %xmm7, %xmm5 + vaesenc 96(%ebp), %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpslldq $8, %xmm0, %xmm6 + vpsrldq $8, %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm4, %xmm4 + vpclmulqdq $0x00, %xmm1, %xmm7, %xmm3 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqa L_aes_gcm_avx1_mod2_128, %xmm7 + vpclmulqdq $16, %xmm7, %xmm6, %xmm3 + vaesenc 128(%ebp), %xmm4, %xmm4 + vpshufd $0x4e, %xmm6, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpclmulqdq $16, %xmm7, %xmm0, %xmm3 + vaesenc 144(%ebp), %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm2 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm5, %xmm2, %xmm2 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_avx1_aesenc_gfmul_last: + vaesenclast %xmm5, %xmm4, %xmm4 + vmovdqu (%ecx), %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vmovdqu %xmm4, (%edx) + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + addl $16, %ebx + vpxor %xmm4, %xmm2, %xmm2 + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_avx1_last_block_start +L_AES_GCM_encrypt_avx1_last_block_ghash: + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm2, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm2, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm2 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 +L_AES_GCM_encrypt_avx1_last_block_done: + movl 152(%esp), %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done + vmovdqu 64(%esp), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0 + vpxor (%ebp), %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 172(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm5, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 172(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm5, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm5, %xmm0, %xmm0 + subl $16, %esp + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop: + movzbl (%esi,%ebx,1), %eax + xorb (%esp,%ecx,1), %al + movb %al, (%edi,%ebx,1) + movb %al, (%esp,%ecx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop + xorl %eax, %eax + cmpl $16, 
%ecx + je L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop: + movb %al, (%esp,%ecx,1) + incl %ecx + cmpl $16, %ecx + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc: + vmovdqu (%esp), %xmm0 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm2, %xmm2 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm2, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm2, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm2 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx1_done_enc: + movl 148(%esp), %edi + movl 164(%esp), %ebx + movl 152(%esp), %edx + movl 156(%esp), %ecx + shll $3, %edx + shll $3, %ecx + vpinsrd $0x00, %edx, %xmm4, %xmm4 + vpinsrd $2, %ecx, %xmm4, %xmm4 + movl 152(%esp), %edx + movl 156(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + vpinsrd $0x01, %edx, %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm2, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm2, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm2 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2 + vpxor 80(%esp), %xmm2, %xmm4 + cmpl $16, %ebx + je L_AES_GCM_encrypt_avx1_store_tag_16 + xorl %ecx, %ecx + vmovdqu %xmm4, (%esp) +L_AES_GCM_encrypt_avx1_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl %ebx, %ecx + jne L_AES_GCM_encrypt_avx1_store_tag_loop + jmp L_AES_GCM_encrypt_avx1_store_tag_done +L_AES_GCM_encrypt_avx1_store_tag_16: + vmovdqu %xmm4, (%edi) +L_AES_GCM_encrypt_avx1_store_tag_done: + addl $0x70, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_avx1,.-AES_GCM_encrypt_avx1 +.text +.globl AES_GCM_decrypt_avx1 +.type AES_GCM_decrypt_avx1,@function +.align 16 +AES_GCM_decrypt_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xb0, %esp + movl 208(%esp), %esi + movl 232(%esp), %ebp + movl 224(%esp), %edx + vpxor %xmm0, %xmm0, %xmm0 + vpxor %xmm2, %xmm2, %xmm2 + cmpl $12, %edx + jne 
L_AES_GCM_decrypt_avx1_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vpinsrd $0x00, (%esi), %xmm0, %xmm0 + vpinsrd $0x01, 4(%esi), %xmm0, %xmm0 + vpinsrd $2, 8(%esi), %xmm0, %xmm0 + vpinsrd $3, %ecx, %xmm0, %xmm0 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%ebp), %xmm1 + vpxor %xmm1, %xmm0, %xmm5 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm1, %xmm1 + vaesenc %xmm3, %xmm5, %xmm5 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_avx1_calc_iv_12_last: + vaesenclast %xmm3, %xmm1, %xmm1 + vaesenclast %xmm3, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 + vmovdqu %xmm5, 80(%esp) + jmp L_AES_GCM_decrypt_avx1_iv_done +L_AES_GCM_decrypt_avx1_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%ebp), %xmm1 + vaesenc 16(%ebp), %xmm1, %xmm1 + vaesenc 32(%ebp), %xmm1, %xmm1 + vaesenc 48(%ebp), %xmm1, %xmm1 + vaesenc 64(%ebp), %xmm1, %xmm1 + vaesenc 80(%ebp), %xmm1, %xmm1 + vaesenc 96(%ebp), %xmm1, %xmm1 + vaesenc 112(%ebp), %xmm1, %xmm1 + vaesenc 128(%ebp), %xmm1, %xmm1 + vaesenc 144(%ebp), %xmm1, %xmm1 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm5, %xmm1, %xmm1 + vaesenc 176(%ebp), %xmm1, %xmm1 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm5, %xmm1, %xmm1 + vaesenc 208(%ebp), %xmm1, %xmm1 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm5, %xmm1, %xmm1 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_decrypt_avx1_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx1_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx1_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, 
%xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx1_calc_iv_16_loop + movl 224(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx1_calc_iv_done +L_AES_GCM_decrypt_avx1_calc_iv_lt16: + subl $16, %esp + vpxor %xmm4, %xmm4, %xmm4 + xorl %ebx, %ebx + vmovdqu %xmm4, (%esp) +L_AES_GCM_decrypt_avx1_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx1_calc_iv_loop + vmovdqu (%esp), %xmm4 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 +L_AES_GCM_decrypt_avx1_calc_iv_done: + # T = Encrypt counter + vpxor %xmm4, %xmm4, %xmm4 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm0, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm0 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm0, %xmm0 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm0, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm0, %xmm0 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm0, %xmm0 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm0, %xmm0 + vpslld $31, %xmm3, %xmm4 + vpslld $30, 
%xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + # Encrypt counter + vmovdqa (%ebp), %xmm4 + vpxor %xmm0, %xmm4, %xmm4 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vaesenc 96(%ebp), %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm4, %xmm4 + vaesenc 128(%ebp), %xmm4, %xmm4 + vaesenc 144(%ebp), %xmm4, %xmm4 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm5, %xmm4, %xmm4 + vmovdqu %xmm4, 80(%esp) +L_AES_GCM_decrypt_avx1_iv_done: + movl 204(%esp), %esi + # Additional authentication data + movl 220(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_avx1_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx1_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx1_calc_aad_16_loop: + vmovdqu (%esi,%ecx,1), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm2, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm4 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm2, %xmm2 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm2, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm2, %xmm2 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm2, %xmm2 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm2, %xmm2 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx1_calc_aad_16_loop + movl 220(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx1_calc_aad_done +L_AES_GCM_decrypt_avx1_calc_aad_lt16: + subl $16, %esp + vpxor %xmm4, %xmm4, %xmm4 + xorl %ebx, %ebx + vmovdqu %xmm4, (%esp) +L_AES_GCM_decrypt_avx1_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx1_calc_aad_loop + vmovdqu (%esp), %xmm4 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # 
ghash_gfmul_avx + vpshufd $0x4e, %xmm2, %xmm5 + vpshufd $0x4e, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm4 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqa %xmm4, %xmm3 + vmovdqa %xmm7, %xmm2 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm5, %xmm2, %xmm2 + vpsrld $31, %xmm3, %xmm4 + vpsrld $31, %xmm2, %xmm5 + vpslld $0x01, %xmm3, %xmm3 + vpslld $0x01, %xmm2, %xmm2 + vpsrldq $12, %xmm4, %xmm6 + vpslldq $4, %xmm4, %xmm4 + vpslldq $4, %xmm5, %xmm5 + vpor %xmm6, %xmm2, %xmm2 + vpor %xmm4, %xmm3, %xmm3 + vpor %xmm5, %xmm2, %xmm2 + vpslld $31, %xmm3, %xmm4 + vpslld $30, %xmm3, %xmm5 + vpslld $25, %xmm3, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vmovdqa %xmm4, %xmm5 + vpsrldq $4, %xmm5, %xmm5 + vpslldq $12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpsrld $0x01, %xmm3, %xmm6 + vpsrld $2, %xmm3, %xmm7 + vpsrld $7, %xmm3, %xmm4 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm6, %xmm2, %xmm2 +L_AES_GCM_decrypt_avx1_calc_aad_done: + vmovdqu %xmm2, 96(%esp) + movl 196(%esp), %esi + movl 200(%esp), %edi + # Calculate counter and H + vpsrlq $63, %xmm1, %xmm5 + vpsllq $0x01, %xmm1, %xmm4 + vpslldq $8, %xmm5, %xmm5 + vpor %xmm5, %xmm4, %xmm4 + vpshufd $0xff, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm1 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0 + vpand L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, 64(%esp) + xorl %ebx, %ebx + cmpl $0x40, 216(%esp) + movl 216(%esp), %eax + jl L_AES_GCM_decrypt_avx1_done_64 + andl $0xffffffc0, %eax + vmovdqa %xmm2, %xmm6 + # H ^ 1 + vmovdqu %xmm1, (%esp) + # H ^ 2 + vpclmulqdq $0x00, %xmm1, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm1, %xmm1, %xmm0 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm0, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm0, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm0, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm3 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm3, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor 
%xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm3, 48(%esp) + cmpl %esi, %edi + jne L_AES_GCM_decrypt_avx1_ghash_64 +L_AES_GCM_decrypt_avx1_ghash_64_inplace: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm4 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 + vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 + vpshufb %xmm3, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx1_two, %xmm4, %xmm6 + vpshufb %xmm3, %xmm6, %xmm6 + vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 + vpshufb %xmm3, %xmm7, %xmm7 + vpshufb %xmm3, %xmm4, %xmm4 + vmovdqu 64(%esp), %xmm3 + vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 + vmovdqu %xmm3, 64(%esp) + vmovdqa (%ebp), %xmm3 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm3, %xmm4, %xmm4 + vaesenclast %xmm3, %xmm5, %xmm5 + vmovdqu (%ecx), %xmm0 + vmovdqu 16(%ecx), %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu %xmm0, 112(%esp) + vmovdqu %xmm1, 128(%esp) + vmovdqu %xmm4, (%edx) + vmovdqu %xmm5, 16(%edx) + vaesenclast %xmm3, %xmm6, 
%xmm6 + vaesenclast %xmm3, %xmm7, %xmm7 + vmovdqu 32(%ecx), %xmm0 + vmovdqu 48(%ecx), %xmm1 + vpxor %xmm0, %xmm6, %xmm6 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm0, 144(%esp) + vmovdqu %xmm1, 160(%esp) + vmovdqu %xmm6, 32(%edx) + vmovdqu %xmm7, 48(%edx) + # ghash encrypted counter + vmovdqu 96(%esp), %xmm6 + vmovdqu 48(%esp), %xmm3 + vmovdqu 112(%esp), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpshufd $0x4e, %xmm3, %xmm5 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm7 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm6 + vpclmulqdq $0x00, %xmm1, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqu 32(%esp), %xmm3 + vmovdqu 128(%esp), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu 16(%esp), %xmm3 + vmovdqu 144(%esp), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu (%esp), %xmm3 + vmovdqu 160(%esp), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpslld $31, %xmm6, %xmm3 + vpslld $30, %xmm6, %xmm0 + vpslld $25, %xmm6, %xmm1 + vpxor %xmm0, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpsrldq $4, %xmm3, %xmm0 + vpslldq $12, %xmm3, %xmm3 + vpxor %xmm3, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm1 + vpsrld $2, %xmm6, %xmm5 + vpsrld $7, %xmm6, %xmm4 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_avx1_ghash_64_inplace + jmp L_AES_GCM_decrypt_avx1_ghash_64_done +L_AES_GCM_decrypt_avx1_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm4 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 + vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 + vpshufb %xmm3, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx1_two, %xmm4, %xmm6 + vpshufb %xmm3, %xmm6, %xmm6 + vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 + vpshufb %xmm3, %xmm7, %xmm7 + vpshufb %xmm3, %xmm4, %xmm4 + vmovdqu 64(%esp), %xmm3 + vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 + vmovdqu %xmm3, 64(%esp) + vmovdqa (%ebp), %xmm3 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqa 16(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, 
%xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 32(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 48(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 64(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 80(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 96(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 112(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 128(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 144(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 176(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm3 + jl L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 208(%ebp), %xmm3 + vaesenc %xmm3, %xmm4, %xmm4 + vaesenc %xmm3, %xmm5, %xmm5 + vaesenc %xmm3, %xmm6, %xmm6 + vaesenc %xmm3, %xmm7, %xmm7 + vmovdqa 224(%ebp), %xmm3 +L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm3, %xmm4, %xmm4 + vaesenclast %xmm3, %xmm5, %xmm5 + vmovdqu (%ecx), %xmm0 + vmovdqu 16(%ecx), %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu %xmm0, (%ecx) + vmovdqu %xmm1, 16(%ecx) + vmovdqu %xmm4, (%edx) + vmovdqu %xmm5, 16(%edx) + vaesenclast %xmm3, %xmm6, %xmm6 + vaesenclast %xmm3, %xmm7, %xmm7 + vmovdqu 32(%ecx), %xmm0 + vmovdqu 48(%ecx), %xmm1 + vpxor %xmm0, %xmm6, %xmm6 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm0, 32(%ecx) + vmovdqu %xmm1, 48(%ecx) + vmovdqu %xmm6, 32(%edx) + vmovdqu %xmm7, 48(%edx) + # ghash encrypted counter + vmovdqu 96(%esp), %xmm6 + vmovdqu 48(%esp), %xmm3 + vmovdqu (%ecx), %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpshufd $0x4e, %xmm3, %xmm5 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm7 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm6 + vpclmulqdq $0x00, %xmm1, %xmm5, %xmm5 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vmovdqu 32(%esp), %xmm3 + vmovdqu 16(%ecx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor 
%xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu 16(%esp), %xmm3 + vmovdqu 32(%ecx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqu (%esp), %xmm3 + vmovdqu 48(%ecx), %xmm4 + vpshufd $0x4e, %xmm3, %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm3, %xmm4, %xmm2 + vpclmulqdq $0x00, %xmm3, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm1, %xmm0, %xmm0 + vpxor %xmm3, %xmm5, %xmm5 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm0, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpslld $31, %xmm6, %xmm3 + vpslld $30, %xmm6, %xmm0 + vpslld $25, %xmm6, %xmm1 + vpxor %xmm0, %xmm3, %xmm3 + vpxor %xmm1, %xmm3, %xmm3 + vpsrldq $4, %xmm3, %xmm0 + vpslldq $12, %xmm3, %xmm3 + vpxor %xmm3, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm1 + vpsrld $2, %xmm6, %xmm5 + vpsrld $7, %xmm6, %xmm4 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu %xmm6, 96(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_avx1_ghash_64 +L_AES_GCM_decrypt_avx1_ghash_64_done: + vmovdqa %xmm6, %xmm2 + vmovdqu (%esp), %xmm1 +L_AES_GCM_decrypt_avx1_done_64: + movl 216(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_avx1_done_dec + movl 216(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_avx1_last_block_done +L_AES_GCM_decrypt_avx1_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu (%ecx), %xmm7 + pshufb L_aes_gcm_avx1_bswap_mask, %xmm7 + pxor %xmm2, %xmm7 + vmovdqu 64(%esp), %xmm5 + vmovdqu %xmm7, %xmm7 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4 + vpaddd L_aes_gcm_avx1_one, %xmm5, %xmm5 + vmovdqu %xmm5, 64(%esp) + vpxor (%ebp), %xmm4, %xmm4 + vpclmulqdq $16, %xmm1, %xmm7, %xmm0 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vpclmulqdq $0x01, %xmm1, %xmm7, %xmm3 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vpclmulqdq $0x11, %xmm1, %xmm7, %xmm5 + vaesenc 96(%ebp), %xmm4, %xmm4 + vpxor %xmm3, %xmm0, %xmm0 + vpslldq $8, %xmm0, %xmm6 + vpsrldq $8, %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm4, %xmm4 + vpclmulqdq $0x00, %xmm1, %xmm7, %xmm3 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm0, %xmm5, %xmm5 + vmovdqa L_aes_gcm_avx1_mod2_128, %xmm7 + vpclmulqdq $16, %xmm7, %xmm6, %xmm3 + vaesenc 128(%ebp), %xmm4, %xmm4 + vpshufd $0x4e, %xmm6, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpclmulqdq $16, %xmm7, %xmm0, %xmm3 + vaesenc 144(%ebp), %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm2 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm5, %xmm2, %xmm2 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc %xmm5, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqa 
224(%ebp), %xmm5 +L_AES_GCM_decrypt_avx1_aesenc_gfmul_last: + vaesenclast %xmm5, %xmm4, %xmm4 + vmovdqu (%ecx), %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vmovdqu %xmm4, (%edx) + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_avx1_last_block_start +L_AES_GCM_decrypt_avx1_last_block_done: + movl 216(%esp), %ecx + movl %ecx, %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done + vmovdqu 64(%esp), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0 + vpxor (%ebp), %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 236(%esp) + vmovdqa 160(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm5, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 236(%esp) + vmovdqa 192(%ebp), %xmm5 + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm5, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm5 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm5, %xmm0, %xmm0 + subl $32, %esp + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) + vpxor %xmm4, %xmm4, %xmm4 + vmovdqu %xmm4, 16(%esp) +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop: + movzbl (%esi,%ebx,1), %eax + movb %al, 16(%esp,%ecx,1) + xorb (%esp,%ecx,1), %al + movb %al, (%edi,%ebx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop + vmovdqu 16(%esp), %xmm0 + addl $32, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm2, %xmm2 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm2, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm2, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm2 + vpslld $31, %xmm4, %xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx1_done_dec: + movl 212(%esp), %esi + movl 228(%esp), %ebp + movl 216(%esp), %edx + movl 220(%esp), %ecx + shll $3, %edx + shll $3, %ecx + vpinsrd $0x00, %edx, %xmm4, %xmm4 + vpinsrd $2, %ecx, %xmm4, %xmm4 + movl 216(%esp), %edx + movl 220(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + vpinsrd $0x01, %edx, %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm1, %xmm5 + vpshufd $0x4e, %xmm2, %xmm6 + vpclmulqdq $0x11, %xmm1, %xmm2, %xmm7 + vpclmulqdq $0x00, %xmm1, %xmm2, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm2, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm6 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm6, %xmm4, %xmm4 + vpxor %xmm5, %xmm7, %xmm2 + vpslld $31, %xmm4, 
%xmm5 + vpslld $30, %xmm4, %xmm6 + vpslld $25, %xmm4, %xmm7 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm7, %xmm5, %xmm5 + vpsrldq $4, %xmm5, %xmm7 + vpslldq $12, %xmm5, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm5 + vpsrld $2, %xmm4, %xmm6 + vpxor %xmm6, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpsrld $7, %xmm4, %xmm4 + vpxor %xmm7, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2 + vpxor 80(%esp), %xmm2, %xmm4 + movl 240(%esp), %edi + cmpl $16, %ebp + je L_AES_GCM_decrypt_avx1_cmp_tag_16 + subl $16, %esp + xorl %ecx, %ecx + xorl %ebx, %ebx + vmovdqu %xmm4, (%esp) +L_AES_GCM_decrypt_avx1_cmp_tag_loop: + movzbl (%esp,%ecx,1), %eax + xorb (%esi,%ecx,1), %al + orb %al, %bl + incl %ecx + cmpl %ebp, %ecx + jne L_AES_GCM_decrypt_avx1_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addl $16, %esp + xorl %ecx, %ecx + jmp L_AES_GCM_decrypt_avx1_cmp_tag_done +L_AES_GCM_decrypt_avx1_cmp_tag_16: + vmovdqu (%esi), %xmm5 + vpcmpeqb %xmm5, %xmm4, %xmm4 + vpmovmskb %xmm4, %edx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_avx1_cmp_tag_done: + movl %ebx, (%edi) + addl $0xb0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_avx1,.-AES_GCM_decrypt_avx1 +#ifdef WOLFSSL_AESGCM_STREAM +.text +.globl AES_GCM_init_avx1 +.type AES_GCM_init_avx1,@function +.align 16 +AES_GCM_init_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 36(%esp), %ebp + movl 44(%esp), %esi + movl 60(%esp), %edi + vpxor %xmm4, %xmm4, %xmm4 + movl 48(%esp), %edx + cmpl $12, %edx + jne L_AES_GCM_init_avx1_iv_not_12 + # # Calculate values when IV is 12 bytes + # Set counter based on IV + movl $0x1000000, %ecx + vpinsrd $0x00, (%esi), %xmm4, %xmm4 + vpinsrd $0x01, 4(%esi), %xmm4, %xmm4 + vpinsrd $2, 8(%esi), %xmm4, %xmm4 + vpinsrd $3, %ecx, %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqa (%ebp), %xmm5 + vpxor %xmm5, %xmm4, %xmm1 + vmovdqa 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $11, 40(%esp) + vmovdqa 160(%ebp), %xmm7 + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + cmpl $13, 40(%esp) + vmovdqa 192(%ebp), %xmm7 + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm1, %xmm1 + vmovdqa 224(%ebp), %xmm7 +L_AES_GCM_init_avx1_calc_iv_12_last: + vaesenclast %xmm7, %xmm5, %xmm5 + vaesenclast %xmm7, %xmm1, %xmm1 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5 + vmovdqu %xmm1, (%edi) + jmp L_AES_GCM_init_avx1_iv_done 
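+ # IV is not 12 bytes: the AES rounds below compute H = E_K(0^128) (byte-reflected
+ # in xmm5 for the GHASH math); the IV is then GHASHed in 16-byte blocks, with a
+ # zero-padded tail and the IV bit length, to form the pre-counter block J0
+ # (per NIST SP 800-38D), and E_K(J0) is stored at (%edi) for the final tag.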
+L_AES_GCM_init_avx1_iv_not_12: + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqa (%ebp), %xmm5 + vaesenc 16(%ebp), %xmm5, %xmm5 + vaesenc 32(%ebp), %xmm5, %xmm5 + vaesenc 48(%ebp), %xmm5, %xmm5 + vaesenc 64(%ebp), %xmm5, %xmm5 + vaesenc 80(%ebp), %xmm5, %xmm5 + vaesenc 96(%ebp), %xmm5, %xmm5 + vaesenc 112(%ebp), %xmm5, %xmm5 + vaesenc 128(%ebp), %xmm5, %xmm5 + vaesenc 144(%ebp), %xmm5, %xmm5 + cmpl $11, 40(%esp) + vmovdqa 160(%ebp), %xmm1 + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm1, %xmm5, %xmm5 + vaesenc 176(%ebp), %xmm5, %xmm5 + cmpl $13, 40(%esp) + vmovdqa 192(%ebp), %xmm1 + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc %xmm1, %xmm5, %xmm5 + vaesenc 208(%ebp), %xmm5, %xmm5 + vmovdqa 224(%ebp), %xmm1 +L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm1, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_init_avx1_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_avx1_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_init_avx1_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx1_calc_iv_16_loop + movl 48(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_init_avx1_calc_iv_done +L_AES_GCM_init_avx1_calc_iv_lt16: + subl $16, %esp + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_init_avx1_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx1_calc_iv_loop + vmovdqu (%esp), %xmm0 + addl $16, %esp + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + 
vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 +L_AES_GCM_init_avx1_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm7 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm7, %xmm7 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm7, %xmm0 + vpslld $30, %xmm7, %xmm1 + vpslld $25, %xmm7, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm7, %xmm7 + vpsrld $0x01, %xmm7, %xmm2 + vpsrld $2, %xmm7, %xmm3 + vpsrld $7, %xmm7, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + # Encrypt counter + vmovdqa (%ebp), %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 40(%esp) + vmovdqa 160(%ebp), %xmm1 + jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 40(%esp) + vmovdqa 192(%ebp), %xmm1 + jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm1 +L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edi) +L_AES_GCM_init_avx1_iv_done: + movl 52(%esp), %ebp + movl 56(%esp), %edi + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm4, %xmm4 + vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm4 + vmovdqa %xmm5, (%ebp) + vmovdqa %xmm4, (%edi) + addl $16, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_init_avx1,.-AES_GCM_init_avx1 +.text +.globl AES_GCM_aad_update_avx1 +.type AES_GCM_aad_update_avx1,@function +.align 16 +AES_GCM_aad_update_avx1: + pushl %esi + pushl %edi + movl 12(%esp), %esi + movl 16(%esp), %edx + movl 20(%esp), %edi + movl 24(%esp), %eax + vmovdqa (%edi), %xmm5 + vmovdqa (%eax), %xmm6 + xorl 
%ecx, %ecx +L_AES_GCM_aad_update_avx1_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm5, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm5 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm5, %xmm5 + vpsrld $31, %xmm4, %xmm0 + vpsrld $31, %xmm5, %xmm1 + vpslld $0x01, %xmm4, %xmm4 + vpslld $0x01, %xmm5, %xmm5 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm5, %xmm5 + vpor %xmm0, %xmm4, %xmm4 + vpor %xmm1, %xmm5, %xmm5 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm5, %xmm5 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_aad_update_avx1_16_loop + vmovdqa %xmm5, (%edi) + popl %edi + popl %esi + ret +.size AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1 +.text +.globl AES_GCM_encrypt_block_avx1 +.type AES_GCM_encrypt_block_avx1,@function +.align 16 +AES_GCM_encrypt_block_avx1: + pushl %esi + pushl %edi + movl 12(%esp), %ecx + movl 16(%esp), %eax + movl 20(%esp), %edi + movl 24(%esp), %esi + movl 28(%esp), %edx + vmovdqu (%edx), %xmm1 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1 + vmovdqu %xmm1, (%edx) + vpxor (%ecx), %xmm0, %xmm0 + vaesenc 16(%ecx), %xmm0, %xmm0 + vaesenc 32(%ecx), %xmm0, %xmm0 + vaesenc 48(%ecx), %xmm0, %xmm0 + vaesenc 64(%ecx), %xmm0, %xmm0 + vaesenc 80(%ecx), %xmm0, %xmm0 + vaesenc 96(%ecx), %xmm0, %xmm0 + vaesenc 112(%ecx), %xmm0, %xmm0 + vaesenc 128(%ecx), %xmm0, %xmm0 + vaesenc 144(%ecx), %xmm0, %xmm0 + cmpl $11, %eax + vmovdqa 160(%ecx), %xmm1 + jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%ecx), %xmm0, %xmm0 + cmpl $13, %eax + vmovdqa 192(%ecx), %xmm1 + jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%ecx), %xmm0, %xmm0 + vmovdqa 224(%ecx), %xmm1 +L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%esi), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edi) + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_block_avx1,.-AES_GCM_encrypt_block_avx1 +.text +.globl AES_GCM_ghash_block_avx1 +.type AES_GCM_ghash_block_avx1,@function +.align 16 +AES_GCM_ghash_block_avx1: + movl 4(%esp), %edx + movl 8(%esp), %eax + movl 12(%esp), %ecx + vmovdqa (%eax), %xmm4 + vmovdqa (%ecx), %xmm5 + vmovdqu (%edx), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm4, %xmm1 + vpshufd $0x4e, %xmm5, %xmm2 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + 
vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm6 + vmovdqa %xmm3, %xmm4 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm4, %xmm4 + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + vpslld $31, %xmm6, %xmm0 + vpslld $30, %xmm6, %xmm1 + vpslld $25, %xmm6, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + vpsrld $0x01, %xmm6, %xmm2 + vpsrld $2, %xmm6, %xmm3 + vpsrld $7, %xmm6, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm6, %xmm2, %xmm2 + vpxor %xmm2, %xmm4, %xmm4 + vmovdqa %xmm4, (%eax) + ret +.size AES_GCM_ghash_block_avx1,.-AES_GCM_ghash_block_avx1 +.text +.globl AES_GCM_encrypt_update_avx1 +.type AES_GCM_encrypt_update_avx1,@function +.align 16 +AES_GCM_encrypt_update_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x60, %esp + movl 144(%esp), %esi + vmovdqa (%esi), %xmm4 + vmovdqu %xmm4, 64(%esp) + movl 136(%esp), %esi + movl 140(%esp), %ebp + vmovdqa (%esi), %xmm6 + vmovdqa (%ebp), %xmm5 + vmovdqu %xmm6, 80(%esp) + movl 116(%esp), %ebp + movl 124(%esp), %edi + movl 128(%esp), %esi + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 132(%esp) + movl 132(%esp), %eax + jl L_AES_GCM_encrypt_update_avx1_done_64 + andl $0xffffffc0, %eax + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%esp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm4 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vmovdqu %xmm4, 16(%esp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm4, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm1, %xmm3, %xmm7 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm7, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm4, %xmm4, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm4, %xmm7 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, 
%xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm7, 48(%esp) + # First 64 bytes of input + vmovdqu 64(%esp), %xmm0 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_two, %xmm0, %xmm2 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufb %xmm7, %xmm0, %xmm0 + vmovdqu 64(%esp), %xmm7 + vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 + vmovdqu %xmm7, 64(%esp) + vmovdqa (%ebp), %xmm7 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqa 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 120(%esp) + vmovdqa 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 120(%esp) + vmovdqa 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 224(%ebp), %xmm7 +L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done: + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vmovdqu (%esi), %xmm4 + vmovdqu 16(%esi), %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vmovdqu %xmm4, (%esi) + vmovdqu %xmm5, 16(%esi) + vmovdqu %xmm0, (%edi) + vmovdqu %xmm1, 16(%edi) + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu 32(%esi), %xmm4 + vmovdqu 48(%esi), %xmm5 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm5, %xmm3, 
%xmm3 + vmovdqu %xmm4, 32(%esi) + vmovdqu %xmm5, 48(%esi) + vmovdqu %xmm2, 32(%edi) + vmovdqu %xmm3, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx + jle L_AES_GCM_encrypt_update_avx1_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_update_avx1_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm0 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_two, %xmm0, %xmm2 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufb %xmm7, %xmm0, %xmm0 + vmovdqu 64(%esp), %xmm7 + vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 + vmovdqu %xmm7, 64(%esp) + vmovdqa (%ebp), %xmm7 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqa 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 120(%esp) + vmovdqa 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 120(%esp) + vmovdqa 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 224(%ebp), %xmm7 +L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vmovdqu (%ecx), %xmm4 + vmovdqu 16(%ecx), %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ecx), %xmm4 + vmovdqu 48(%ecx), %xmm5 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # ghash 
encrypted counter + vmovdqu 80(%esp), %xmm2 + vmovdqu 48(%esp), %xmm7 + vmovdqu -64(%edx), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm2 + vpclmulqdq $0x00, %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqu 32(%esp), %xmm7 + vmovdqu -48(%edx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu 16(%esp), %xmm7 + vmovdqu -32(%edx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu (%esp), %xmm7 + vmovdqu -16(%edx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm5 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $31, %xmm2, %xmm7 + vpslld $30, %xmm2, %xmm4 + vpslld $25, %xmm2, %xmm5 + vpxor %xmm4, %xmm7, %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vpsrldq $4, %xmm7, %xmm4 + vpslldq $12, %xmm7, %xmm7 + vpxor %xmm7, %xmm2, %xmm2 + vpsrld $0x01, %xmm2, %xmm5 + vpsrld $2, %xmm2, %xmm1 + vpsrld $7, %xmm2, %xmm0 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm3, %xmm2, %xmm2 + vmovdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_avx1_ghash_64 +L_AES_GCM_encrypt_update_avx1_end_64: + movdqu 80(%esp), %xmm6 + # Block 1 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm0 + vmovdqu (%edx), %xmm5 + pshufb %xmm0, %xmm5 + vmovdqu 48(%esp), %xmm7 + pxor %xmm6, %xmm5 + # ghash_gfmul_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm7, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm7, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm7, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqa %xmm0, %xmm4 + vmovdqa %xmm3, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm6, %xmm6 + # Block 2 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm0 + vmovdqu 16(%edx), %xmm5 + pshufb %xmm0, %xmm5 + vmovdqu 32(%esp), %xmm7 + # ghash_gfmul_xor_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm7, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm7, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm7, %xmm0 + vpxor %xmm5, %xmm1, 
%xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm3, %xmm6, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm6, %xmm6 + # Block 3 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm0 + vmovdqu 32(%edx), %xmm5 + pshufb %xmm0, %xmm5 + vmovdqu 16(%esp), %xmm7 + # ghash_gfmul_xor_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm7, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm7, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm7, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm3, %xmm6, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm6, %xmm6 + # Block 4 + vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm0 + vmovdqu 48(%edx), %xmm5 + pshufb %xmm0, %xmm5 + vmovdqu (%esp), %xmm7 + # ghash_gfmul_xor_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm7, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm7, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm7, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm4, %xmm4 + vpxor %xmm3, %xmm6, %xmm6 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm4, %xmm4 + vpxor %xmm1, %xmm6, %xmm6 + vpslld $31, %xmm4, %xmm0 + vpslld $30, %xmm4, %xmm1 + vpslld $25, %xmm4, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vmovdqa %xmm0, %xmm1 + vpsrldq $4, %xmm1, %xmm1 + vpslldq $12, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + vpsrld $0x01, %xmm4, %xmm2 + vpsrld $2, %xmm4, %xmm3 + vpsrld $7, %xmm4, %xmm0 + vpxor %xmm3, %xmm2, %xmm2 + vpxor %xmm0, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 + vmovdqu (%esp), %xmm5 +L_AES_GCM_encrypt_update_avx1_done_64: + movl 132(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_encrypt_update_avx1_done_enc + movl 132(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_avx1_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm1 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1 + vmovdqu %xmm1, 64(%esp) + vpxor (%ebp), %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 120(%esp) + vmovdqa 160(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 120(%esp) + vmovdqa 192(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm1 +L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%ecx), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edx) + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_avx1_last_block_ghash +L_AES_GCM_encrypt_update_avx1_last_block_start: + leal 
(%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm1 + vmovdqu %xmm6, %xmm3 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1 + vmovdqu %xmm1, 64(%esp) + vpxor (%ebp), %xmm0, %xmm0 + vpclmulqdq $16, %xmm5, %xmm3, %xmm4 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x01, %xmm5, %xmm3, %xmm7 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x11, %xmm5, %xmm3, %xmm1 + vaesenc 96(%ebp), %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpslldq $8, %xmm4, %xmm2 + vpsrldq $8, %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x00, %xmm5, %xmm3, %xmm7 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqa L_aes_gcm_avx1_mod2_128, %xmm3 + vpclmulqdq $16, %xmm3, %xmm2, %xmm7 + vaesenc 128(%ebp), %xmm0, %xmm0 + vpshufd $0x4e, %xmm2, %xmm4 + vpxor %xmm7, %xmm4, %xmm4 + vpclmulqdq $16, %xmm3, %xmm4, %xmm7 + vaesenc 144(%ebp), %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + cmpl $11, 120(%esp) + vmovdqa 160(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 120(%esp) + vmovdqa 192(%ebp), %xmm1 + jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm1 +L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%ecx), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edx) + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + addl $16, %ebx + vpxor %xmm0, %xmm6, %xmm6 + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_avx1_last_block_start +L_AES_GCM_encrypt_update_avx1_last_block_ghash: + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm1, %xmm3, %xmm6 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 +L_AES_GCM_encrypt_update_avx1_last_block_done: +L_AES_GCM_encrypt_update_avx1_done_enc: + movl 136(%esp), %esi + movl 144(%esp), %edi + vmovdqu 64(%esp), %xmm4 + vmovdqa %xmm6, (%esi) + vmovdqu %xmm4, (%edi) + addl $0x60, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_update_avx1,.-AES_GCM_encrypt_update_avx1 +.text +.globl AES_GCM_encrypt_final_avx1 +.type AES_GCM_encrypt_final_avx1,@function +.align 16 +AES_GCM_encrypt_final_avx1: + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 32(%esp), %ebp + movl 52(%esp), %esi + movl 56(%esp), %edi + vmovdqa (%ebp), %xmm4 + vmovdqa (%esi), %xmm5 + vmovdqa (%edi), %xmm6 + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand 
L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + movl 44(%esp), %edx + movl 48(%esp), %ecx + shll $3, %edx + shll $3, %ecx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 44(%esp), %edx + movl 48(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + vpinsrd $0x01, %edx, %xmm0, %xmm0 + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm4, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm1, %xmm3, %xmm4 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm0 + movl 36(%esp), %edi + cmpl $16, 40(%esp) + je L_AES_GCM_encrypt_final_avx1_store_tag_16 + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_final_avx1_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl 40(%esp), %ecx + jne L_AES_GCM_encrypt_final_avx1_store_tag_loop + jmp L_AES_GCM_encrypt_final_avx1_store_tag_done +L_AES_GCM_encrypt_final_avx1_store_tag_16: + vmovdqu %xmm0, (%edi) +L_AES_GCM_encrypt_final_avx1_store_tag_done: + addl $16, %esp + popl %ebp + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_final_avx1,.-AES_GCM_encrypt_final_avx1 +.text +.globl AES_GCM_decrypt_update_avx1 +.type AES_GCM_decrypt_update_avx1,@function +.align 16 +AES_GCM_decrypt_update_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xa0, %esp + movl 208(%esp), %esi + vmovdqa (%esi), %xmm4 + vmovdqu %xmm4, 64(%esp) + movl 200(%esp), %esi + movl 204(%esp), %ebp + vmovdqa (%esi), %xmm6 + vmovdqa (%ebp), %xmm5 + vmovdqu %xmm6, 80(%esp) + movl 180(%esp), %ebp + movl 188(%esp), %edi + movl 192(%esp), %esi + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 196(%esp) + movl 196(%esp), %eax + jl L_AES_GCM_decrypt_update_avx1_done_64 + andl $0xffffffc0, %eax + vmovdqa %xmm6, %xmm2 + # H ^ 1 + vmovdqu %xmm5, (%esp) + # H ^ 2 + vpclmulqdq $0x00, %xmm5, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm5, %xmm5, %xmm4 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vmovdqu %xmm4, 16(%esp) + # H ^ 3 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm4, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm4, 
%xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm1, %xmm3, %xmm7 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm7, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm4, %xmm4, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm4, %xmm7 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + vmovdqu %xmm7, 48(%esp) + cmpl %esi, %edi + jne L_AES_GCM_decrypt_update_avx1_ghash_64 +L_AES_GCM_decrypt_update_avx1_ghash_64_inplace: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm0 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_two, %xmm0, %xmm2 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufb %xmm7, %xmm0, %xmm0 + vmovdqu 64(%esp), %xmm7 + vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 + vmovdqu %xmm7, 64(%esp) + vmovdqa (%ebp), %xmm7 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqa 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 184(%esp) + vmovdqa 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc 
%xmm7, %xmm3, %xmm3 + vmovdqa 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 184(%esp) + vmovdqa 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vmovdqu (%ecx), %xmm4 + vmovdqu 16(%ecx), %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vmovdqu %xmm4, 96(%esp) + vmovdqu %xmm5, 112(%esp) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ecx), %xmm4 + vmovdqu 48(%ecx), %xmm5 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm4, 128(%esp) + vmovdqu %xmm5, 144(%esp) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # ghash encrypted counter + vmovdqu 80(%esp), %xmm2 + vmovdqu 48(%esp), %xmm7 + vmovdqu 96(%esp), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm2 + vpclmulqdq $0x00, %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqu 32(%esp), %xmm7 + vmovdqu 112(%esp), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu 16(%esp), %xmm7 + vmovdqu 128(%esp), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu (%esp), %xmm7 + vmovdqu 144(%esp), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm5 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $31, %xmm2, %xmm7 + vpslld $30, %xmm2, %xmm4 + vpslld $25, %xmm2, %xmm5 + vpxor %xmm4, %xmm7, %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vpsrldq $4, %xmm7, %xmm4 + vpslldq $12, %xmm7, %xmm7 + vpxor %xmm7, %xmm2, %xmm2 + vpsrld $0x01, %xmm2, %xmm5 + vpsrld $2, %xmm2, %xmm1 + vpsrld $7, %xmm2, %xmm0 + 
vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm3, %xmm2, %xmm2 + vmovdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx1_ghash_64_inplace + jmp L_AES_GCM_decrypt_update_avx1_ghash_64_done +L_AES_GCM_decrypt_update_avx1_ghash_64: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu 64(%esp), %xmm0 + vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx1_two, %xmm0, %xmm2 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 + vpshufb %xmm7, %xmm3, %xmm3 + vpshufb %xmm7, %xmm0, %xmm0 + vmovdqu 64(%esp), %xmm7 + vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 + vmovdqu %xmm7, 64(%esp) + vmovdqa (%ebp), %xmm7 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqa 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 184(%esp) + vmovdqa 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 184(%esp) + vmovdqa 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqa 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vmovdqu (%ecx), %xmm4 + vmovdqu 16(%ecx), %xmm5 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vmovdqu %xmm4, (%ecx) + vmovdqu %xmm5, 16(%ecx) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ecx), %xmm4 + vmovdqu 48(%ecx), %xmm5 + vpxor %xmm4, 
%xmm2, %xmm2 + vpxor %xmm5, %xmm3, %xmm3 + vmovdqu %xmm4, 32(%ecx) + vmovdqu %xmm5, 48(%ecx) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # ghash encrypted counter + vmovdqu 80(%esp), %xmm2 + vmovdqu 48(%esp), %xmm7 + vmovdqu (%ecx), %xmm0 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm2 + vpclmulqdq $0x00, %xmm5, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vmovdqu 32(%esp), %xmm7 + vmovdqu 16(%ecx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu 16(%esp), %xmm7 + vmovdqu 32(%ecx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu (%esp), %xmm7 + vmovdqu 48(%ecx), %xmm0 + vpshufd $0x4e, %xmm7, %xmm4 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpshufd $0x4e, %xmm0, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm7, %xmm0, %xmm6 + vpclmulqdq $0x00, %xmm7, %xmm0, %xmm7 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm4 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm6, %xmm3, %xmm3 + vpxor %xmm4, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm5 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpslld $31, %xmm2, %xmm7 + vpslld $30, %xmm2, %xmm4 + vpslld $25, %xmm2, %xmm5 + vpxor %xmm4, %xmm7, %xmm7 + vpxor %xmm5, %xmm7, %xmm7 + vpsrldq $4, %xmm7, %xmm4 + vpslldq $12, %xmm7, %xmm7 + vpxor %xmm7, %xmm2, %xmm2 + vpsrld $0x01, %xmm2, %xmm5 + vpsrld $2, %xmm2, %xmm1 + vpsrld $7, %xmm2, %xmm0 + vpxor %xmm1, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm3, %xmm2, %xmm2 + vmovdqu %xmm2, 80(%esp) + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx1_ghash_64 +L_AES_GCM_decrypt_update_avx1_ghash_64_done: + vmovdqa %xmm2, %xmm6 + vmovdqu (%esp), %xmm5 +L_AES_GCM_decrypt_update_avx1_done_64: + movl 196(%esp), %edx + cmpl %edx, %ebx + jge L_AES_GCM_decrypt_update_avx1_done_dec + movl 196(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_update_avx1_last_block_done +L_AES_GCM_decrypt_update_avx1_last_block_start: + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + vmovdqu (%ecx), %xmm1 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 + vpxor %xmm6, %xmm1, %xmm1 + vmovdqu %xmm1, (%esp) + vmovdqu 64(%esp), %xmm1 + vmovdqu (%esp), %xmm3 + vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1 + vmovdqu %xmm1, 64(%esp) + vpxor (%ebp), %xmm0, %xmm0 + vpclmulqdq $16, %xmm5, %xmm3, %xmm4 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 
32(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x01, %xmm5, %xmm3, %xmm7 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x11, %xmm5, %xmm3, %xmm1 + vaesenc 96(%ebp), %xmm0, %xmm0 + vpxor %xmm7, %xmm4, %xmm4 + vpslldq $8, %xmm4, %xmm2 + vpsrldq $8, %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm0, %xmm0 + vpclmulqdq $0x00, %xmm5, %xmm3, %xmm7 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqa L_aes_gcm_avx1_mod2_128, %xmm3 + vpclmulqdq $16, %xmm3, %xmm2, %xmm7 + vaesenc 128(%ebp), %xmm0, %xmm0 + vpshufd $0x4e, %xmm2, %xmm4 + vpxor %xmm7, %xmm4, %xmm4 + vpclmulqdq $16, %xmm3, %xmm4, %xmm7 + vaesenc 144(%ebp), %xmm0, %xmm0 + vpshufd $0x4e, %xmm4, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + cmpl $11, 184(%esp) + vmovdqa 160(%ebp), %xmm1 + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 184(%esp) + vmovdqa 192(%ebp), %xmm1 + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc %xmm1, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqa 224(%ebp), %xmm1 +L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last: + vaesenclast %xmm1, %xmm0, %xmm0 + vmovdqu (%ecx), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edx) + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx1_last_block_start +L_AES_GCM_decrypt_update_avx1_last_block_done: +L_AES_GCM_decrypt_update_avx1_done_dec: + movl 200(%esp), %esi + movl 208(%esp), %edi + vmovdqu 64(%esp), %xmm4 + vmovdqa %xmm6, (%esi) + vmovdqu %xmm4, (%edi) + addl $0xa0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_update_avx1,.-AES_GCM_decrypt_update_avx1 +.text +.globl AES_GCM_decrypt_final_avx1 +.type AES_GCM_decrypt_final_avx1,@function +.align 16 +AES_GCM_decrypt_final_avx1: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 36(%esp), %ebp + movl 56(%esp), %esi + movl 60(%esp), %edi + vmovdqa (%ebp), %xmm6 + vmovdqa (%esi), %xmm5 + vmovdqa (%edi), %xmm7 + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + movl 48(%esp), %edx + movl 52(%esp), %ecx + shll $3, %edx + shll $3, %ecx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 48(%esp), %edx + movl 52(%esp), %ecx + shrl $29, %edx + shrl $29, %ecx + vpinsrd $0x01, %edx, %xmm0, %xmm0 + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_red_avx + vpshufd $0x4e, %xmm5, %xmm1 + vpshufd $0x4e, %xmm6, %xmm2 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm6, %xmm2, %xmm2 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpslldq $8, %xmm1, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm1, %xmm3, %xmm6 + vpslld $31, %xmm0, %xmm1 + vpslld $30, %xmm0, %xmm2 + vpslld $25, %xmm0, %xmm3 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm3, %xmm1, %xmm1 + vpsrldq $4, %xmm1, %xmm3 + vpslldq $12, %xmm1, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpsrld $0x01, %xmm0, %xmm1 + vpsrld $2, %xmm0, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpsrld $7, %xmm0, %xmm0 + vpxor %xmm3, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm6, %xmm6 + vpxor %xmm7, 
%xmm6, %xmm0 + movl 40(%esp), %esi + movl 64(%esp), %edi + cmpl $16, 44(%esp) + je L_AES_GCM_decrypt_final_avx1_cmp_tag_16 + subl $16, %esp + xorl %ecx, %ecx + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_decrypt_final_avx1_cmp_tag_loop: + movzbl (%esp,%ecx,1), %eax + xorb (%esi,%ecx,1), %al + orb %al, %bl + incl %ecx + cmpl 44(%esp), %ecx + jne L_AES_GCM_decrypt_final_avx1_cmp_tag_loop + cmpb $0x00, %bl + sete %bl + addl $16, %esp + xorl %ecx, %ecx + jmp L_AES_GCM_decrypt_final_avx1_cmp_tag_done +L_AES_GCM_decrypt_final_avx1_cmp_tag_16: + vmovdqu (%esi), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %edx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ebx, %ebx + cmpl $0xffff, %edx + sete %bl +L_AES_GCM_decrypt_final_avx1_cmp_tag_done: + movl %ebx, (%edi) + addl $16, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_final_avx1,.-AES_GCM_decrypt_final_avx1 +#endif /* WOLFSSL_AESGCM_STREAM */ +#endif /* HAVE_INTEL_AVX1 */ +#ifdef HAVE_INTEL_AVX2 +.text +.globl AES_GCM_encrypt_avx2 +.type AES_GCM_encrypt_avx2,@function +.align 16 +AES_GCM_encrypt_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x70, %esp + movl 144(%esp), %esi + movl 168(%esp), %ebp + movl 160(%esp), %edx + vpxor %xmm4, %xmm4, %xmm4 + cmpl $12, %edx + je L_AES_GCM_encrypt_avx2_iv_12 + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqu (%ebp), %xmm5 + vaesenc 16(%ebp), %xmm5, %xmm5 + vaesenc 32(%ebp), %xmm5, %xmm5 + vaesenc 48(%ebp), %xmm5, %xmm5 + vaesenc 64(%ebp), %xmm5, %xmm5 + vaesenc 80(%ebp), %xmm5, %xmm5 + vaesenc 96(%ebp), %xmm5, %xmm5 + vaesenc 112(%ebp), %xmm5, %xmm5 + vaesenc 128(%ebp), %xmm5, %xmm5 + vaesenc 144(%ebp), %xmm5, %xmm5 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 176(%ebp), %xmm5, %xmm5 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 208(%ebp), %xmm5, %xmm5 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_encrypt_avx2_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx2_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx2_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx2_calc_iv_16_loop + movl 
160(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx2_calc_iv_done +L_AES_GCM_encrypt_avx2_calc_iv_lt16: + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_avx2_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx2_calc_iv_loop + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 +L_AES_GCM_encrypt_avx2_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4 + # Encrypt counter + vmovdqu (%ebp), %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vaesenc 16(%ebp), %xmm6, %xmm6 + vaesenc 32(%ebp), %xmm6, %xmm6 + vaesenc 48(%ebp), %xmm6, %xmm6 + vaesenc 64(%ebp), %xmm6, %xmm6 + vaesenc 80(%ebp), %xmm6, %xmm6 + vaesenc 96(%ebp), %xmm6, %xmm6 + vaesenc 112(%ebp), %xmm6, %xmm6 + vaesenc 128(%ebp), %xmm6, %xmm6 + vaesenc 144(%ebp), %xmm6, %xmm6 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm6, %xmm6 + vaesenc 176(%ebp), %xmm6, %xmm6 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm6, %xmm6 + vaesenc 208(%ebp), %xmm6, %xmm6 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm0, %xmm6, %xmm6 + jmp L_AES_GCM_encrypt_avx2_iv_done +L_AES_GCM_encrypt_avx2_iv_12: + # # Calculate values when IV is 12 bytes + # Set counter based on IV + vmovdqu L_avx2_aes_gcm_bswap_one, %xmm4 + vmovdqu (%ebp), %xmm5 + vpblendd $7, (%esi), %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + 
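# (note: with a 12-byte IV, J0 = IV || 0x00000001; xmm5 starts as round key 0 and becomes the hash key H = E_K(0), while xmm6 = rk0 ^ J0 becomes E_K(J0); both are driven through the same round keys in lockstep below) +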
vmovdqu 16(%ebp), %xmm7 + vpxor %xmm5, %xmm4, %xmm6 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm6, %xmm6 + vmovdqu 32(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 48(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 64(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 80(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 96(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 112(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 128(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 144(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 176(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 208(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_avx2_calc_iv_12_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vaesenclast %xmm0, %xmm6, %xmm6 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 +L_AES_GCM_encrypt_avx2_iv_done: + vmovdqu %xmm6, 80(%esp) + vpxor %xmm6, %xmm6, %xmm6 + movl 140(%esp), %esi + # Additional authentication data + movl 156(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_encrypt_avx2_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_encrypt_avx2_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_encrypt_avx2_calc_aad_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm6, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm6, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm6 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx2_calc_aad_16_loop + movl 156(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_encrypt_avx2_calc_aad_done +L_AES_GCM_encrypt_avx2_calc_aad_lt16: + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_avx2_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_encrypt_avx2_calc_aad_loop + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm6, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm6, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm6, %xmm5, 
%xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm6 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 +L_AES_GCM_encrypt_avx2_calc_aad_done: + movl 132(%esp), %esi + movl 136(%esp), %edi + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 152(%esp) + movl 152(%esp), %eax + jl L_AES_GCM_encrypt_avx2_done_64 + andl $0xffffffc0, %eax + vmovdqu %xmm4, 64(%esp) + vmovdqu %xmm6, 96(%esp) + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm3 + # H ^ 1 + vmovdqu %xmm5, (%esp) + vmovdqu %xmm5, %xmm2 + # H ^ 2 + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm5 + vpclmulqdq $0x11, %xmm2, %xmm2, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red + vpclmulqdq $16, %xmm0, %xmm2, %xmm6 + vpclmulqdq $0x01, %xmm0, %xmm2, %xmm5 + vpclmulqdq $0x00, %xmm0, %xmm2, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + vpslldq $8, %xmm6, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm2, %xmm1 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm1, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm2 + vmovdqu %xmm2, 48(%esp) + vmovdqu 96(%esp), %xmm6 + # First 64 bytes of input + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, 
%xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_encrypt_avx2_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%esi), %xmm7 + vmovdqu 16(%esi), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, (%edi) + vmovdqu %xmm1, 16(%edi) + vmovdqu 32(%esi), %xmm7 + vmovdqu 48(%esi), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm2, 32(%edi) + vmovdqu %xmm3, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx + jle L_AES_GCM_encrypt_avx2_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_avx2_ghash_64: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + 
vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu -64(%edx), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu -48(%edx), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu -32(%edx), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu -16(%edx), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 
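+ # (note: each pclmul step multiplies one ciphertext block from the previous 64-byte pass by the matching power of H (H^4..H^1) and XOR-accumulates the low/middle/high partial products in xmm6/xmm5/xmm7, so only one reduction is needed per 64 bytes)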
+ vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_avx2_ghash_64 +L_AES_GCM_encrypt_avx2_end_64: + vmovdqu %xmm6, 96(%esp) + vmovdqu 48(%edx), %xmm3 + vmovdqu (%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm5 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm4 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm6 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu 32(%edx), %xmm3 + vmovdqu 16(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vmovdqu 16(%edx), %xmm3 + vmovdqu 32(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vmovdqu 96(%esp), %xmm0 + vmovdqu (%edx), %xmm3 + vmovdqu 48(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpxor %xmm0, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vpslldq $8, %xmm5, %xmm7 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm7, %xmm4, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm4, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vmovdqu (%esp), %xmm5 + vmovdqu 64(%esp), %xmm4 +L_AES_GCM_encrypt_avx2_done_64: + cmpl 152(%esp), %ebx + je L_AES_GCM_encrypt_avx2_done_enc + movl 152(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_avx2_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_block + vmovdqu %xmm4, %xmm1 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx2_one, %xmm1, %xmm1 + vpxor (%ebp), %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm2 + jl L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm2 + jl 
L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqu 224(%ebp), %xmm2 +L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm1, %xmm4 + vmovdqu (%ecx), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edx) + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_avx2_last_block_ghash +L_AES_GCM_encrypt_avx2_last_block_start: + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vmovdqu %xmm4, 64(%esp) + # aesenc_gfmul_sb + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm2 + vpclmulqdq $16, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm4 + vpxor (%ebp), %xmm7, %xmm7 + vaesenc 16(%ebp), %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpslldq $8, %xmm3, %xmm2 + vpsrldq $8, %xmm3, %xmm3 + vaesenc 32(%ebp), %xmm7, %xmm7 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 48(%ebp), %xmm7, %xmm7 + vaesenc 64(%ebp), %xmm7, %xmm7 + vaesenc 80(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 96(%ebp), %xmm7, %xmm7 + vaesenc 112(%ebp), %xmm7, %xmm7 + vaesenc 128(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vaesenc 144(%ebp), %xmm7, %xmm7 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vmovdqu 160(%ebp), %xmm0 + cmpl $11, 172(%esp) + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 176(%ebp), %xmm7, %xmm7 + vmovdqu 192(%ebp), %xmm0 + cmpl $13, 172(%esp) + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 208(%ebp), %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast %xmm0, %xmm7, %xmm7 + vmovdqu (%esi,%ebx,1), %xmm3 + vpxor %xmm1, %xmm2, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqu %xmm7, (%edi,%ebx,1) + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu 64(%esp), %xmm4 + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_avx2_last_block_start +L_AES_GCM_encrypt_avx2_last_block_ghash: + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm6, %xmm2 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm6 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm0, %xmm6, %xmm6 +L_AES_GCM_encrypt_avx2_last_block_done: + movl 152(%esp), %ecx + movl 152(%esp), %edx + andl $15, %ecx + jz L_AES_GCM_encrypt_avx2_done_enc + # aesenc_last15_enc + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4 + vpxor (%ebp), %xmm4, %xmm4 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vaesenc 96(%ebp), %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm4, %xmm4 + vaesenc 128(%ebp), %xmm4, %xmm4 + vaesenc 144(%ebp), %xmm4, %xmm4 + cmpl $11, 172(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm0, %xmm4, 
%xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 172(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc %xmm0, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast %xmm0, %xmm4, %xmm4 + xorl %ecx, %ecx + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm4, (%esp) + vmovdqu %xmm0, 16(%esp) +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop: + movzbl (%esi,%ebx,1), %eax + xorb (%esp,%ecx,1), %al + movb %al, 16(%esp,%ecx,1) + movb %al, (%edi,%ebx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc: + vmovdqu 16(%esp), %xmm4 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm6, %xmm2 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm6 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm0, %xmm6, %xmm6 +L_AES_GCM_encrypt_avx2_done_enc: + vmovdqu 80(%esp), %xmm7 + # calc_tag + movl 152(%esp), %ecx + shll $3, %ecx + vpinsrd $0x00, %ecx, %xmm0, %xmm0 + movl 156(%esp), %ecx + shll $3, %ecx + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 152(%esp), %ecx + shrl $29, %ecx + vpinsrd $0x01, %ecx, %xmm0, %xmm0 + movl 156(%esp), %ecx + shrl $29, %ecx + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm0, %xmm4 + vpclmulqdq $0x01, %xmm5, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm2 + vpxor %xmm3, %xmm4, %xmm4 + vpslldq $8, %xmm4, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm0 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + movl 148(%esp), %edi + movl 164(%esp), %ebx + # store_tag + cmpl $16, %ebx + je L_AES_GCM_encrypt_avx2_store_tag_16 + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_avx2_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl %ebx, %ecx + jne L_AES_GCM_encrypt_avx2_store_tag_loop + jmp L_AES_GCM_encrypt_avx2_store_tag_done +L_AES_GCM_encrypt_avx2_store_tag_16: + vmovdqu %xmm0, (%edi) +L_AES_GCM_encrypt_avx2_store_tag_done: + addl $0x70, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_avx2,.-AES_GCM_encrypt_avx2 +.text +.globl AES_GCM_decrypt_avx2 +.type AES_GCM_decrypt_avx2,@function +.align 16 +AES_GCM_decrypt_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xb0, %esp + movl 208(%esp), %esi + movl 232(%esp), %ebp + vpxor %xmm4, %xmm4, %xmm4 + movl 224(%esp), %edx + cmpl $12, %edx + je L_AES_GCM_decrypt_avx2_iv_12 + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqu (%ebp), %xmm5 + vaesenc 16(%ebp), %xmm5, %xmm5 + vaesenc 32(%ebp), %xmm5, 
%xmm5 + vaesenc 48(%ebp), %xmm5, %xmm5 + vaesenc 64(%ebp), %xmm5, %xmm5 + vaesenc 80(%ebp), %xmm5, %xmm5 + vaesenc 96(%ebp), %xmm5, %xmm5 + vaesenc 112(%ebp), %xmm5, %xmm5 + vaesenc 128(%ebp), %xmm5, %xmm5 + vaesenc 144(%ebp), %xmm5, %xmm5 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 176(%ebp), %xmm5, %xmm5 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 208(%ebp), %xmm5, %xmm5 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_decrypt_avx2_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx2_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx2_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx2_calc_iv_16_loop + movl 224(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx2_calc_iv_done +L_AES_GCM_decrypt_avx2_calc_iv_lt16: + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_decrypt_avx2_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx2_calc_iv_loop + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 +L_AES_GCM_decrypt_avx2_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpxor %xmm0, 
%xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4 + # Encrypt counter + vmovdqu (%ebp), %xmm6 + vpxor %xmm4, %xmm6, %xmm6 + vaesenc 16(%ebp), %xmm6, %xmm6 + vaesenc 32(%ebp), %xmm6, %xmm6 + vaesenc 48(%ebp), %xmm6, %xmm6 + vaesenc 64(%ebp), %xmm6, %xmm6 + vaesenc 80(%ebp), %xmm6, %xmm6 + vaesenc 96(%ebp), %xmm6, %xmm6 + vaesenc 112(%ebp), %xmm6, %xmm6 + vaesenc 128(%ebp), %xmm6, %xmm6 + vaesenc 144(%ebp), %xmm6, %xmm6 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm6, %xmm6 + vaesenc 176(%ebp), %xmm6, %xmm6 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm6, %xmm6 + vaesenc 208(%ebp), %xmm6, %xmm6 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm0, %xmm6, %xmm6 + jmp L_AES_GCM_decrypt_avx2_iv_done +L_AES_GCM_decrypt_avx2_iv_12: + # # Calculate values when IV is 12 bytes + # Set counter based on IV + vmovdqu L_avx2_aes_gcm_bswap_one, %xmm4 + vmovdqu (%ebp), %xmm5 + vpblendd $7, (%esi), %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqu 16(%ebp), %xmm7 + vpxor %xmm5, %xmm4, %xmm6 + vaesenc %xmm7, %xmm5, %xmm5 + vaesenc %xmm7, %xmm6, %xmm6 + vmovdqu 32(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 48(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 64(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 80(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 96(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 112(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 128(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 144(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 176(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 208(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm6, %xmm6 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_decrypt_avx2_calc_iv_12_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vaesenclast %xmm0, %xmm6, %xmm6 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 +L_AES_GCM_decrypt_avx2_iv_done: + 
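# (note: xmm6 holds E_K(J0) at this point; it is spilled to 80(%esp) for the final tag XOR, then cleared so xmm6 can serve as the GHASH accumulator over the AAD) +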
vmovdqu %xmm6, 80(%esp) + vpxor %xmm6, %xmm6, %xmm6 + movl 204(%esp), %esi + # Additional authentication data + movl 220(%esp), %edx + cmpl $0x00, %edx + je L_AES_GCM_decrypt_avx2_calc_aad_done + xorl %ecx, %ecx + cmpl $16, %edx + jl L_AES_GCM_decrypt_avx2_calc_aad_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_decrypt_avx2_calc_aad_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm6, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm6, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm6 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx2_calc_aad_16_loop + movl 220(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_decrypt_avx2_calc_aad_done +L_AES_GCM_decrypt_avx2_calc_aad_lt16: + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_decrypt_avx2_calc_aad_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_decrypt_avx2_calc_aad_loop + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm6, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm6, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm6, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm6, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm7 + vpxor %xmm2, %xmm3, %xmm6 + # ghash_mid + vpsrld $31, %xmm7, %xmm0 + vpsrld $31, %xmm6, %xmm1 + vpslld $0x01, %xmm7, %xmm7 + vpslld $0x01, %xmm6, %xmm6 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + vpor %xmm0, %xmm7, %xmm7 + vpor %xmm1, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm7, %xmm0 + vpshufd $0x4e, %xmm7, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 +L_AES_GCM_decrypt_avx2_calc_aad_done: + movl 196(%esp), %esi + movl 200(%esp), %edi + # Calculate counter and H + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 216(%esp) + movl 216(%esp), %eax + jl L_AES_GCM_decrypt_avx2_done_64 + andl $0xffffffc0, %eax + vmovdqu %xmm4, 64(%esp) + vmovdqu %xmm6, 96(%esp) + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm3 + # H ^ 1 + vmovdqu %xmm5, (%esp) + vmovdqu %xmm5, %xmm2 + # H ^ 2 + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm5 + vpclmulqdq $0x11, 
%xmm2, %xmm2, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red + vpclmulqdq $16, %xmm0, %xmm2, %xmm6 + vpclmulqdq $0x01, %xmm0, %xmm2, %xmm5 + vpclmulqdq $0x00, %xmm0, %xmm2, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + vpslldq $8, %xmm6, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm2, %xmm1 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm1, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm2 + vmovdqu %xmm2, 48(%esp) + vmovdqu 96(%esp), %xmm6 + cmpl %esi, %edi + jne L_AES_GCM_decrypt_avx2_ghash_64 +L_AES_GCM_decrypt_avx2_ghash_64_inplace: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + 
vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm7, 112(%esp) + vmovdqu %xmm4, 128(%esp) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm7, 144(%esp) + vmovdqu %xmm4, 160(%esp) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu 112(%esp), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu 128(%esp), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 144(%esp), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 160(%esp), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_avx2_ghash_64_inplace + jmp L_AES_GCM_decrypt_avx2_ghash_64_done +L_AES_GCM_decrypt_avx2_ghash_64: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd 
L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm7, (%ecx) + vmovdqu %xmm4, 16(%ecx) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm7, 32(%ecx) + vmovdqu %xmm4, 48(%ecx) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu (%ecx), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu 16(%ecx), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, 
%xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 32(%ecx), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 48(%ecx), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_avx2_ghash_64 +L_AES_GCM_decrypt_avx2_ghash_64_done: + vmovdqu (%esp), %xmm5 + vmovdqu 64(%esp), %xmm4 +L_AES_GCM_decrypt_avx2_done_64: + cmpl 216(%esp), %ebx + jge L_AES_GCM_decrypt_avx2_done_dec + movl 216(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_avx2_last_block_done +L_AES_GCM_decrypt_avx2_last_block_start: + vmovdqu (%esi,%ebx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm6, %xmm0, %xmm4 + # aesenc_gfmul_sb + vpclmulqdq $0x01, %xmm5, %xmm4, %xmm2 + vpclmulqdq $16, %xmm5, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm4 + vpxor (%ebp), %xmm7, %xmm7 + vaesenc 16(%ebp), %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpslldq $8, %xmm3, %xmm2 + vpsrldq $8, %xmm3, %xmm3 + vaesenc 32(%ebp), %xmm7, %xmm7 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 48(%ebp), %xmm7, %xmm7 + vaesenc 64(%ebp), %xmm7, %xmm7 + vaesenc 80(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 96(%ebp), %xmm7, %xmm7 + vaesenc 112(%ebp), %xmm7, %xmm7 + vaesenc 128(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vaesenc 144(%ebp), %xmm7, %xmm7 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vmovdqu 160(%ebp), %xmm0 + cmpl $11, 236(%esp) + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 176(%ebp), %xmm7, %xmm7 + vmovdqu 192(%ebp), %xmm0 + cmpl $13, 236(%esp) + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 208(%ebp), %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast %xmm0, %xmm7, %xmm7 + vmovdqu (%esi,%ebx,1), %xmm3 + vpxor %xmm1, %xmm2, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqu %xmm7, (%edi,%ebx,1) + vmovdqu 64(%esp), %xmm4 + addl $16, %ebx + cmpl %eax, %ebx + jl 
L_AES_GCM_decrypt_avx2_last_block_start +L_AES_GCM_decrypt_avx2_last_block_done: + movl 216(%esp), %ecx + movl 216(%esp), %edx + andl $15, %ecx + jz L_AES_GCM_decrypt_avx2_done_dec + # aesenc_last15_dec + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4 + vpxor (%ebp), %xmm4, %xmm4 + vaesenc 16(%ebp), %xmm4, %xmm4 + vaesenc 32(%ebp), %xmm4, %xmm4 + vaesenc 48(%ebp), %xmm4, %xmm4 + vaesenc 64(%ebp), %xmm4, %xmm4 + vaesenc 80(%ebp), %xmm4, %xmm4 + vaesenc 96(%ebp), %xmm4, %xmm4 + vaesenc 112(%ebp), %xmm4, %xmm4 + vaesenc 128(%ebp), %xmm4, %xmm4 + vaesenc 144(%ebp), %xmm4, %xmm4 + cmpl $11, 236(%esp) + vmovdqu 160(%ebp), %xmm1 + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm1, %xmm4, %xmm4 + vaesenc 176(%ebp), %xmm4, %xmm4 + cmpl $13, 236(%esp) + vmovdqu 192(%ebp), %xmm1 + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc %xmm1, %xmm4, %xmm4 + vaesenc 208(%ebp), %xmm4, %xmm4 + vmovdqu 224(%ebp), %xmm1 +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast %xmm1, %xmm4, %xmm4 + xorl %ecx, %ecx + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %xmm4, (%esp) + vmovdqu %xmm0, 16(%esp) +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop: + movzbl (%esi,%ebx,1), %eax + movb %al, 16(%esp,%ecx,1) + xorb (%esp,%ecx,1), %al + movb %al, (%edi,%ebx,1) + incl %ebx + incl %ecx + cmpl %edx, %ebx + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop + vmovdqu 16(%esp), %xmm4 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4 + vpxor %xmm4, %xmm6, %xmm6 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm6, %xmm2 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm6 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm0, %xmm6, %xmm6 +L_AES_GCM_decrypt_avx2_done_dec: + vmovdqu 80(%esp), %xmm7 + # calc_tag + movl 216(%esp), %ecx + shll $3, %ecx + vpinsrd $0x00, %ecx, %xmm0, %xmm0 + movl 220(%esp), %ecx + shll $3, %ecx + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 216(%esp), %ecx + shrl $29, %ecx + vpinsrd $0x01, %ecx, %xmm0, %xmm0 + movl 220(%esp), %ecx + shrl $29, %ecx + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm0, %xmm4 + vpclmulqdq $0x01, %xmm5, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm2 + vpxor %xmm3, %xmm4, %xmm4 + vpslldq $8, %xmm4, %xmm3 + vpsrldq $8, %xmm4, %xmm4 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm0 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm7, %xmm0, %xmm0 + movl 212(%esp), %edi + movl 228(%esp), %ebx + movl 240(%esp), %ebp + # cmp_tag + cmpl $16, %ebx + je L_AES_GCM_decrypt_avx2_cmp_tag_16 + xorl %edx, %edx + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) +L_AES_GCM_decrypt_avx2_cmp_tag_loop: + movzbl (%esp,%edx,1), %eax + xorb (%edi,%edx,1), %al + orb %al, %cl + incl %edx + cmpl %ebx, %edx + jne L_AES_GCM_decrypt_avx2_cmp_tag_loop + cmpb $0x00, %cl + sete %cl + jmp 
L_AES_GCM_decrypt_avx2_cmp_tag_done +L_AES_GCM_decrypt_avx2_cmp_tag_16: + vmovdqu (%edi), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %edx + # %%edx == 0xFFFF then return 1 else => return 0 + xorl %ecx, %ecx + cmpl $0xffff, %edx + sete %cl +L_AES_GCM_decrypt_avx2_cmp_tag_done: + movl %ecx, (%ebp) + addl $0xb0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_avx2,.-AES_GCM_decrypt_avx2 +#ifdef WOLFSSL_AESGCM_STREAM +.text +.globl AES_GCM_init_avx2 +.type AES_GCM_init_avx2,@function +.align 16 +AES_GCM_init_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $32, %esp + movl 52(%esp), %ebp + movl 60(%esp), %esi + movl 76(%esp), %edi + vpxor %xmm4, %xmm4, %xmm4 + movl 64(%esp), %edx + cmpl $12, %edx + je L_AES_GCM_init_avx2_iv_12 + # Calculate values when IV is not 12 bytes + # H = Encrypt X(=0) + vmovdqu (%ebp), %xmm5 + vaesenc 16(%ebp), %xmm5, %xmm5 + vaesenc 32(%ebp), %xmm5, %xmm5 + vaesenc 48(%ebp), %xmm5, %xmm5 + vaesenc 64(%ebp), %xmm5, %xmm5 + vaesenc 80(%ebp), %xmm5, %xmm5 + vaesenc 96(%ebp), %xmm5, %xmm5 + vaesenc 112(%ebp), %xmm5, %xmm5 + vaesenc 128(%ebp), %xmm5, %xmm5 + vaesenc 144(%ebp), %xmm5, %xmm5 + cmpl $11, 56(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 176(%ebp), %xmm5, %xmm5 + cmpl $13, 56(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 208(%ebp), %xmm5, %xmm5 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 + # Calc counter + # Initialization vector + cmpl $0x00, %edx + movl $0x00, %ecx + je L_AES_GCM_init_avx2_calc_iv_done + cmpl $16, %edx + jl L_AES_GCM_init_avx2_calc_iv_lt16 + andl $0xfffffff0, %edx +L_AES_GCM_init_avx2_calc_iv_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm6 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm6, %xmm0 + vpshufd $0x4e, %xmm6, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx2_calc_iv_16_loop + movl 64(%esp), %edx + cmpl %edx, %ecx + je L_AES_GCM_init_avx2_calc_iv_done +L_AES_GCM_init_avx2_calc_iv_lt16: + vpxor %xmm0, %xmm0, %xmm0 + xorl %ebx, %ebx + vmovdqu %xmm0, (%esp) +L_AES_GCM_init_avx2_calc_iv_loop: + movzbl (%esi,%ecx,1), %eax + movb %al, (%esp,%ebx,1) + incl %ecx + incl %ebx + cmpl %edx, %ecx + jl L_AES_GCM_init_avx2_calc_iv_loop + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, 
%xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm6 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm6, %xmm0 + vpshufd $0x4e, %xmm6, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 +L_AES_GCM_init_avx2_calc_iv_done: + # T = Encrypt counter + vpxor %xmm0, %xmm0, %xmm0 + shll $3, %edx + vpinsrd $0x00, %edx, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm6 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm6, %xmm0 + vpshufd $0x4e, %xmm6, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4 + # Encrypt counter + vmovdqu (%ebp), %xmm7 + vpxor %xmm4, %xmm7, %xmm7 + vaesenc 16(%ebp), %xmm7, %xmm7 + vaesenc 32(%ebp), %xmm7, %xmm7 + vaesenc 48(%ebp), %xmm7, %xmm7 + vaesenc 64(%ebp), %xmm7, %xmm7 + vaesenc 80(%ebp), %xmm7, %xmm7 + vaesenc 96(%ebp), %xmm7, %xmm7 + vaesenc 112(%ebp), %xmm7, %xmm7 + vaesenc 128(%ebp), %xmm7, %xmm7 + vaesenc 144(%ebp), %xmm7, %xmm7 + cmpl $11, 56(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 176(%ebp), %xmm7, %xmm7 + cmpl $13, 56(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 208(%ebp), %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast %xmm0, %xmm7, %xmm7 + jmp L_AES_GCM_init_avx2_iv_done +L_AES_GCM_init_avx2_iv_12: + # # Calculate values when IV is 12 bytes + # Set counter based on IV + vmovdqu L_avx2_aes_gcm_bswap_one, %xmm4 + vmovdqu (%ebp), %xmm5 + vpblendd $7, (%esi), %xmm4, %xmm4 + # H = Encrypt X(=0) and T = Encrypt counter + vmovdqu 16(%ebp), %xmm6 + vpxor %xmm5, %xmm4, %xmm7 + vaesenc %xmm6, %xmm5, %xmm5 + vaesenc %xmm6, %xmm7, %xmm7 + vmovdqu 32(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 48(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 64(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 80(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 96(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 112(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc 
%xmm0, %xmm7, %xmm7 + vmovdqu 128(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 144(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + cmpl $11, 56(%esp) + vmovdqu 160(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 176(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + cmpl $13, 56(%esp) + vmovdqu 192(%ebp), %xmm0 + jl L_AES_GCM_init_avx2_calc_iv_12_last + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 208(%ebp), %xmm0 + vaesenc %xmm0, %xmm5, %xmm5 + vaesenc %xmm0, %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_init_avx2_calc_iv_12_last: + vaesenclast %xmm0, %xmm5, %xmm5 + vaesenclast %xmm0, %xmm7, %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5 +L_AES_GCM_init_avx2_iv_done: + vmovdqu %xmm7, (%edi) + movl 68(%esp), %ebp + movl 72(%esp), %edi + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vmovdqu %xmm5, (%ebp) + vmovdqu %xmm4, (%edi) + addl $32, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_init_avx2,.-AES_GCM_init_avx2 +.text +.globl AES_GCM_aad_update_avx2 +.type AES_GCM_aad_update_avx2,@function +.align 16 +AES_GCM_aad_update_avx2: + pushl %esi + pushl %edi + movl 12(%esp), %esi + movl 16(%esp), %edx + movl 20(%esp), %edi + movl 24(%esp), %eax + vmovdqu (%edi), %xmm4 + vmovdqu (%eax), %xmm5 + xorl %ecx, %ecx +L_AES_GCM_aad_update_avx2_16_loop: + vmovdqu (%esi,%ecx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm6 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm6, %xmm0 + vpshufd $0x4e, %xmm6, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + addl $16, %ecx + cmpl %edx, %ecx + jl L_AES_GCM_aad_update_avx2_16_loop + vmovdqu %xmm4, (%edi) + popl %edi + popl %esi + ret +.size AES_GCM_aad_update_avx2,.-AES_GCM_aad_update_avx2 +.text +.globl AES_GCM_encrypt_block_avx2 +.type AES_GCM_encrypt_block_avx2,@function +.align 16 +AES_GCM_encrypt_block_avx2: + pushl %esi + pushl %edi + movl 12(%esp), %ecx + movl 16(%esp), %eax + movl 20(%esp), %edi + movl 24(%esp), %esi + movl 28(%esp), %edx + vmovdqu (%edx), %xmm3 + # aesenc_block + vmovdqu %xmm3, %xmm1 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx2_one, %xmm1, %xmm1 + vpxor (%ecx), %xmm0, %xmm0 + vaesenc 16(%ecx), %xmm0, %xmm0 + vaesenc 32(%ecx), %xmm0, %xmm0 + vaesenc 48(%ecx), %xmm0, %xmm0 + vaesenc 64(%ecx), %xmm0, %xmm0 + vaesenc 80(%ecx), %xmm0, %xmm0 + vaesenc 96(%ecx), %xmm0, %xmm0 + vaesenc 112(%ecx), %xmm0, %xmm0 + vaesenc 128(%ecx), %xmm0, %xmm0 + vaesenc 144(%ecx), %xmm0, %xmm0 + cmpl $11, %eax + vmovdqu 160(%ecx), %xmm2 + jl 
L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 176(%ecx), %xmm0, %xmm0 + cmpl $13, %eax + vmovdqu 192(%ecx), %xmm2 + jl L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 208(%ecx), %xmm0, %xmm0 + vmovdqu 224(%ecx), %xmm2 +L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm1, %xmm3 + vmovdqu (%esi), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edi) + vmovdqu %xmm3, (%edx) + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_block_avx2,.-AES_GCM_encrypt_block_avx2 +.text +.globl AES_GCM_ghash_block_avx2 +.type AES_GCM_ghash_block_avx2,@function +.align 16 +AES_GCM_ghash_block_avx2: + movl 4(%esp), %edx + movl 8(%esp), %eax + movl 12(%esp), %ecx + vmovdqu (%eax), %xmm4 + vmovdqu (%ecx), %xmm5 + vmovdqu (%edx), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm4, %xmm4 + # ghash_gfmul_avx + vpclmulqdq $16, %xmm4, %xmm5, %xmm2 + vpclmulqdq $0x01, %xmm4, %xmm5, %xmm1 + vpclmulqdq $0x00, %xmm4, %xmm5, %xmm0 + vpclmulqdq $0x11, %xmm4, %xmm5, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm6 + vpxor %xmm2, %xmm3, %xmm4 + # ghash_mid + vpsrld $31, %xmm6, %xmm0 + vpsrld $31, %xmm4, %xmm1 + vpslld $0x01, %xmm6, %xmm6 + vpslld $0x01, %xmm4, %xmm4 + vpsrldq $12, %xmm0, %xmm2 + vpslldq $4, %xmm0, %xmm0 + vpslldq $4, %xmm1, %xmm1 + vpor %xmm2, %xmm4, %xmm4 + vpor %xmm0, %xmm6, %xmm6 + vpor %xmm1, %xmm4, %xmm4 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm6, %xmm0 + vpshufd $0x4e, %xmm6, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm4, %xmm4 + vmovdqu %xmm4, (%eax) + ret +.size AES_GCM_ghash_block_avx2,.-AES_GCM_ghash_block_avx2 +.text +.globl AES_GCM_encrypt_update_avx2 +.type AES_GCM_encrypt_update_avx2,@function +.align 16 +AES_GCM_encrypt_update_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0x60, %esp + movl 144(%esp), %esi + vmovdqu (%esi), %xmm4 + vmovdqu %xmm4, 64(%esp) + movl 136(%esp), %esi + movl 140(%esp), %ebp + vmovdqu (%esi), %xmm6 + vmovdqu (%ebp), %xmm5 + vmovdqu %xmm6, 80(%esp) + movl 116(%esp), %ebp + movl 124(%esp), %edi + movl 128(%esp), %esi + # Calculate H + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 132(%esp) + movl 132(%esp), %eax + jl L_AES_GCM_encrypt_update_avx2_done_64 + andl $0xffffffc0, %eax + vmovdqu %xmm4, 64(%esp) + vmovdqu %xmm6, 80(%esp) + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm3 + # H ^ 1 + vmovdqu %xmm5, (%esp) + vmovdqu %xmm5, %xmm2 + # H ^ 2 + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm5 + vpclmulqdq $0x11, %xmm2, %xmm2, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red + vpclmulqdq $16, %xmm0, %xmm2, %xmm6 + vpclmulqdq $0x01, %xmm0, %xmm2, %xmm5 + vpclmulqdq $0x00, %xmm0, %xmm2, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + vpslldq $8, %xmm6, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $0x11, 
%xmm0, %xmm2, %xmm1 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm1, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm2 + vmovdqu %xmm2, 48(%esp) + vmovdqu 80(%esp), %xmm6 + # First 64 bytes of input + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 120(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 120(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu 
(%esi), %xmm7 + vmovdqu 16(%esi), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, (%edi) + vmovdqu %xmm1, 16(%edi) + vmovdqu 32(%esi), %xmm7 + vmovdqu 48(%esi), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm2, 32(%edi) + vmovdqu %xmm3, 48(%edi) + cmpl $0x40, %eax + movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx + jle L_AES_GCM_encrypt_update_avx2_end_64 + # More 64 bytes of input +L_AES_GCM_encrypt_update_avx2_ghash_64: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 120(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 120(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor 
%xmm4, %xmm1, %xmm1 + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu -64(%edx), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu -48(%edx), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu -32(%edx), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu -16(%edx), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_avx2_ghash_64 +L_AES_GCM_encrypt_update_avx2_end_64: + vmovdqu %xmm6, 80(%esp) + vmovdqu 48(%edx), %xmm3 + vmovdqu (%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm5 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm4 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm6 + vpxor %xmm1, %xmm5, %xmm5 + vmovdqu 32(%edx), %xmm3 + vmovdqu 16(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vmovdqu 16(%edx), %xmm3 + vmovdqu 32(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq $0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vmovdqu 80(%esp), %xmm0 + vmovdqu (%edx), %xmm3 + vmovdqu 48(%esp), %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3 + vpxor %xmm0, %xmm3, %xmm3 + vpclmulqdq $16, %xmm3, %xmm7, %xmm2 + vpclmulqdq $0x01, %xmm3, %xmm7, %xmm1 + vpclmulqdq 
$0x00, %xmm3, %xmm7, %xmm0 + vpclmulqdq $0x11, %xmm3, %xmm7, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm0, %xmm4, %xmm4 + vpslldq $8, %xmm5, %xmm7 + vpsrldq $8, %xmm5, %xmm5 + vpxor %xmm7, %xmm4, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + # ghash_red + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm2 + vpclmulqdq $16, %xmm2, %xmm4, %xmm0 + vpshufd $0x4e, %xmm4, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm6, %xmm6 + vmovdqu (%esp), %xmm5 + vmovdqu 64(%esp), %xmm4 +L_AES_GCM_encrypt_update_avx2_done_64: + cmpl 132(%esp), %ebx + je L_AES_GCM_encrypt_update_avx2_done_enc + movl 132(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_avx2_last_block_done + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_block + vmovdqu %xmm4, %xmm1 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0 + vpaddd L_aes_gcm_avx2_one, %xmm1, %xmm1 + vpxor (%ebp), %xmm0, %xmm0 + vaesenc 16(%ebp), %xmm0, %xmm0 + vaesenc 32(%ebp), %xmm0, %xmm0 + vaesenc 48(%ebp), %xmm0, %xmm0 + vaesenc 64(%ebp), %xmm0, %xmm0 + vaesenc 80(%ebp), %xmm0, %xmm0 + vaesenc 96(%ebp), %xmm0, %xmm0 + vaesenc 112(%ebp), %xmm0, %xmm0 + vaesenc 128(%ebp), %xmm0, %xmm0 + vaesenc 144(%ebp), %xmm0, %xmm0 + cmpl $11, 120(%esp) + vmovdqu 160(%ebp), %xmm2 + jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 176(%ebp), %xmm0, %xmm0 + cmpl $13, 120(%esp) + vmovdqu 192(%ebp), %xmm2 + jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last + vaesenc %xmm2, %xmm0, %xmm0 + vaesenc 208(%ebp), %xmm0, %xmm0 + vmovdqu 224(%ebp), %xmm2 +L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last: + vaesenclast %xmm2, %xmm0, %xmm0 + vmovdqu %xmm1, %xmm4 + vmovdqu (%ecx), %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vmovdqu %xmm0, (%edx) + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm0, %xmm6, %xmm6 + addl $16, %ebx + cmpl %eax, %ebx + jge L_AES_GCM_encrypt_update_avx2_last_block_ghash +L_AES_GCM_encrypt_update_avx2_last_block_start: + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vmovdqu %xmm4, 64(%esp) + # aesenc_gfmul_sb + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm2 + vpclmulqdq $16, %xmm5, %xmm6, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm4 + vpxor (%ebp), %xmm7, %xmm7 + vaesenc 16(%ebp), %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpslldq $8, %xmm3, %xmm2 + vpsrldq $8, %xmm3, %xmm3 + vaesenc 32(%ebp), %xmm7, %xmm7 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 48(%ebp), %xmm7, %xmm7 + vaesenc 64(%ebp), %xmm7, %xmm7 + vaesenc 80(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 96(%ebp), %xmm7, %xmm7 + vaesenc 112(%ebp), %xmm7, %xmm7 + vaesenc 128(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vaesenc 144(%ebp), %xmm7, %xmm7 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vmovdqu 160(%ebp), %xmm0 + cmpl $11, 120(%esp) + jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 176(%ebp), %xmm7, %xmm7 + vmovdqu 192(%ebp), %xmm0 + cmpl $13, 120(%esp) + jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 208(%ebp), %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last: 
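+        # Last AES round for this block: xmm7 becomes the keystream,
+        # xmm6 picks up the completed GHASH reduction of the previous
+        # block, the input at (%esi,%ebx) is XORed and stored to
+        # (%edi,%ebx), and the byte-swapped ciphertext is folded back
+        # into the GHASH accumulator before the next 16-byte iteration.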
+ vaesenclast %xmm0, %xmm7, %xmm7 + vmovdqu (%esi,%ebx,1), %xmm3 + vpxor %xmm1, %xmm2, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqu %xmm7, (%edi,%ebx,1) + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7 + vpxor %xmm7, %xmm6, %xmm6 + vmovdqu 64(%esp), %xmm4 + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_encrypt_update_avx2_last_block_start +L_AES_GCM_encrypt_update_avx2_last_block_ghash: + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm6, %xmm2 + vpclmulqdq $0x01, %xmm5, %xmm6, %xmm1 + vpclmulqdq $0x00, %xmm5, %xmm6, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpslldq $8, %xmm2, %xmm1 + vpsrldq $8, %xmm2, %xmm2 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm6, %xmm6 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm0, %xmm1, %xmm1 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0 + vpshufd $0x4e, %xmm1, %xmm1 + vpxor %xmm2, %xmm6, %xmm6 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm0, %xmm6, %xmm6 +L_AES_GCM_encrypt_update_avx2_last_block_done: +L_AES_GCM_encrypt_update_avx2_done_enc: + movl 136(%esp), %esi + movl 144(%esp), %edi + vmovdqu %xmm6, (%esi) + vmovdqu %xmm4, (%edi) + addl $0x60, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_encrypt_update_avx2,.-AES_GCM_encrypt_update_avx2 +.text +.globl AES_GCM_encrypt_final_avx2 +.type AES_GCM_encrypt_final_avx2,@function +.align 16 +AES_GCM_encrypt_final_avx2: + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 32(%esp), %ebp + movl 52(%esp), %esi + movl 56(%esp), %edi + vmovdqu (%ebp), %xmm4 + vmovdqu (%esi), %xmm5 + vmovdqu (%edi), %xmm6 + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + # calc_tag + movl 44(%esp), %ecx + shll $3, %ecx + vpinsrd $0x00, %ecx, %xmm0, %xmm0 + movl 48(%esp), %ecx + shll $3, %ecx + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 44(%esp), %ecx + shrl $29, %ecx + vpinsrd $0x01, %ecx, %xmm0, %xmm0 + movl 48(%esp), %ecx + shrl $29, %ecx + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm5, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm2 + vpxor %xmm3, %xmm7, %xmm7 + vpslldq $8, %xmm7, %xmm3 + vpsrldq $8, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm0 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + movl 36(%esp), %edi + # store_tag + cmpl $16, 40(%esp) + je L_AES_GCM_encrypt_final_avx2_store_tag_16 + xorl %ecx, %ecx + vmovdqu %xmm0, (%esp) +L_AES_GCM_encrypt_final_avx2_store_tag_loop: + movzbl (%esp,%ecx,1), %eax + movb %al, (%edi,%ecx,1) + incl %ecx + cmpl 40(%esp), %ecx + jne L_AES_GCM_encrypt_final_avx2_store_tag_loop + jmp L_AES_GCM_encrypt_final_avx2_store_tag_done +L_AES_GCM_encrypt_final_avx2_store_tag_16: + vmovdqu %xmm0, (%edi) +L_AES_GCM_encrypt_final_avx2_store_tag_done: + addl $16, %esp + popl %ebp + popl %edi + popl %esi + ret +.size AES_GCM_encrypt_final_avx2,.-AES_GCM_encrypt_final_avx2 +.text +.globl AES_GCM_decrypt_update_avx2 +.type AES_GCM_decrypt_update_avx2,@function +.align 16 
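+# AES_GCM_decrypt_update_avx2 mirrors the encrypt-update routine above:
+# it precomputes H^1..H^4 on the stack, decrypts 64 bytes per iteration
+# using four interleaved counter blocks while GHASHing the ciphertext,
+# then finishes any remaining 16-byte blocks one at a time.  When the
+# output buffer aliases the input (in-place decrypt), each group of
+# ciphertext blocks is copied to the stack first so GHASH still sees
+# the ciphertext after it has been overwritten with plaintext.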
+AES_GCM_decrypt_update_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $0xa0, %esp + movl 208(%esp), %esi + vmovdqu (%esi), %xmm4 + movl 200(%esp), %esi + movl 204(%esp), %ebp + vmovdqu (%esi), %xmm6 + vmovdqu (%ebp), %xmm5 + movl 180(%esp), %ebp + movl 188(%esp), %edi + movl 192(%esp), %esi + # Calculate H + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + xorl %ebx, %ebx + cmpl $0x40, 196(%esp) + movl 196(%esp), %eax + jl L_AES_GCM_decrypt_update_avx2_done_64 + andl $0xffffffc0, %eax + vmovdqu %xmm4, 64(%esp) + vmovdqu %xmm6, 80(%esp) + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm3 + # H ^ 1 + vmovdqu %xmm5, (%esp) + vmovdqu %xmm5, %xmm2 + # H ^ 2 + vpclmulqdq $0x00, %xmm2, %xmm2, %xmm5 + vpclmulqdq $0x11, %xmm2, %xmm2, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm0 + vmovdqu %xmm0, 16(%esp) + # H ^ 3 + # ghash_gfmul_red + vpclmulqdq $16, %xmm0, %xmm2, %xmm6 + vpclmulqdq $0x01, %xmm0, %xmm2, %xmm5 + vpclmulqdq $0x00, %xmm0, %xmm2, %xmm4 + vpxor %xmm5, %xmm6, %xmm6 + vpslldq $8, %xmm6, %xmm5 + vpsrldq $8, %xmm6, %xmm6 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm2, %xmm1 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm6, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm1, 32(%esp) + # H ^ 4 + vpclmulqdq $0x00, %xmm0, %xmm0, %xmm5 + vpclmulqdq $0x11, %xmm0, %xmm0, %xmm6 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpclmulqdq $16, %xmm3, %xmm5, %xmm4 + vpshufd $0x4e, %xmm5, %xmm5 + vpxor %xmm4, %xmm5, %xmm5 + vpxor %xmm5, %xmm6, %xmm2 + vmovdqu %xmm2, 48(%esp) + vmovdqu 80(%esp), %xmm6 + cmpl %esi, %edi + jne L_AES_GCM_decrypt_update_avx2_ghash_64 +L_AES_GCM_decrypt_update_avx2_ghash_64_inplace: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc 
%xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 184(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 184(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast %xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm7, 96(%esp) + vmovdqu %xmm4, 112(%esp) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm7, 128(%esp) + vmovdqu %xmm4, 144(%esp) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu 96(%esp), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu 112(%esp), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 128(%esp), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 144(%esp), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + 
vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx2_ghash_64_inplace + jmp L_AES_GCM_decrypt_update_avx2_ghash_64_done +L_AES_GCM_decrypt_update_avx2_ghash_64: + # aesenc_64_ghash + leal (%esi,%ebx,1), %ecx + leal (%edi,%ebx,1), %edx + # aesenc_64 + # aesenc_ctr + vmovdqu 64(%esp), %xmm4 + vmovdqu L_aes_gcm_avx2_bswap_epi64, %xmm7 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm1 + vpshufb %xmm7, %xmm4, %xmm0 + vpaddd L_aes_gcm_avx2_two, %xmm4, %xmm2 + vpshufb %xmm7, %xmm1, %xmm1 + vpaddd L_aes_gcm_avx2_three, %xmm4, %xmm3 + vpshufb %xmm7, %xmm2, %xmm2 + vpaddd L_aes_gcm_avx2_four, %xmm4, %xmm4 + vpshufb %xmm7, %xmm3, %xmm3 + # aesenc_xor + vmovdqu (%ebp), %xmm7 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm7, %xmm1, %xmm1 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm7, %xmm3, %xmm3 + vmovdqu 16(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 32(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 48(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 64(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 80(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 96(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 112(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 128(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 144(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $11, 184(%esp) + vmovdqu 160(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 176(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + cmpl $13, 184(%esp) + vmovdqu 192(%ebp), %xmm7 + jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 208(%ebp), %xmm7 + vaesenc %xmm7, %xmm0, %xmm0 + vaesenc %xmm7, %xmm1, %xmm1 + vaesenc %xmm7, %xmm2, %xmm2 + vaesenc %xmm7, %xmm3, %xmm3 + vmovdqu 224(%ebp), %xmm7 +L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done: + # aesenc_last + vaesenclast %xmm7, %xmm0, %xmm0 + vaesenclast %xmm7, %xmm1, %xmm1 + vaesenclast 
%xmm7, %xmm2, %xmm2 + vaesenclast %xmm7, %xmm3, %xmm3 + vmovdqu (%ecx), %xmm7 + vmovdqu 16(%ecx), %xmm4 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm4, %xmm1, %xmm1 + vmovdqu %xmm7, (%ecx) + vmovdqu %xmm4, 16(%ecx) + vmovdqu %xmm0, (%edx) + vmovdqu %xmm1, 16(%edx) + vmovdqu 32(%ecx), %xmm7 + vmovdqu 48(%ecx), %xmm4 + vpxor %xmm7, %xmm2, %xmm2 + vpxor %xmm4, %xmm3, %xmm3 + vmovdqu %xmm7, 32(%ecx) + vmovdqu %xmm4, 48(%ecx) + vmovdqu %xmm2, 32(%edx) + vmovdqu %xmm3, 48(%edx) + # pclmul_1 + vmovdqu (%ecx), %xmm1 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vmovdqu 48(%esp), %xmm2 + vpxor %xmm6, %xmm1, %xmm1 + vpclmulqdq $16, %xmm2, %xmm1, %xmm5 + vpclmulqdq $0x01, %xmm2, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm2, %xmm1, %xmm6 + vpclmulqdq $0x11, %xmm2, %xmm1, %xmm7 + # pclmul_2 + vmovdqu 16(%ecx), %xmm1 + vmovdqu 32(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 32(%ecx), %xmm1 + vmovdqu 16(%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # pclmul_n + vmovdqu 48(%ecx), %xmm1 + vmovdqu (%esp), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1 + vpxor %xmm2, %xmm5, %xmm5 + vpclmulqdq $16, %xmm0, %xmm1, %xmm2 + vpxor %xmm3, %xmm5, %xmm5 + vpclmulqdq $0x01, %xmm0, %xmm1, %xmm3 + vpxor %xmm4, %xmm6, %xmm6 + vpclmulqdq $0x00, %xmm0, %xmm1, %xmm4 + vpclmulqdq $0x11, %xmm0, %xmm1, %xmm1 + vpxor %xmm1, %xmm7, %xmm7 + # aesenc_pclmul_l + vpxor %xmm2, %xmm5, %xmm5 + vpxor %xmm4, %xmm6, %xmm6 + vpxor %xmm3, %xmm5, %xmm5 + vpslldq $8, %xmm5, %xmm1 + vpsrldq $8, %xmm5, %xmm5 + vmovdqu L_aes_gcm_avx2_mod2_128, %xmm0 + vpxor %xmm1, %xmm6, %xmm6 + vpxor %xmm5, %xmm7, %xmm7 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpclmulqdq $16, %xmm0, %xmm6, %xmm3 + vpshufd $0x4e, %xmm6, %xmm6 + vpxor %xmm3, %xmm6, %xmm6 + vpxor %xmm7, %xmm6, %xmm6 + # aesenc_64_ghash - end + addl $0x40, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx2_ghash_64 +L_AES_GCM_decrypt_update_avx2_ghash_64_done: + vmovdqu (%esp), %xmm5 + vmovdqu 64(%esp), %xmm4 +L_AES_GCM_decrypt_update_avx2_done_64: + cmpl 196(%esp), %ebx + jge L_AES_GCM_decrypt_update_avx2_done_dec + movl 196(%esp), %eax + andl $0xfffffff0, %eax + cmpl %eax, %ebx + jge L_AES_GCM_decrypt_update_avx2_last_block_done +L_AES_GCM_decrypt_update_avx2_last_block_start: + vmovdqu (%esi,%ebx,1), %xmm0 + vpshufb L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpaddd L_aes_gcm_avx2_one, %xmm4, %xmm4 + vmovdqu %xmm4, 64(%esp) + vpxor %xmm6, %xmm0, %xmm4 + # aesenc_gfmul_sb + vpclmulqdq $0x01, %xmm5, %xmm4, %xmm2 + vpclmulqdq $16, %xmm5, %xmm4, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm4, %xmm1 + vpclmulqdq $0x11, %xmm5, %xmm4, %xmm4 + vpxor (%ebp), %xmm7, %xmm7 + vaesenc 16(%ebp), %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpslldq $8, %xmm3, %xmm2 + vpsrldq $8, %xmm3, %xmm3 + vaesenc 32(%ebp), %xmm7, %xmm7 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 48(%ebp), %xmm7, %xmm7 + vaesenc 64(%ebp), %xmm7, %xmm7 
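+        # Reduction of the GHASH product for this ciphertext block
+        # (vpclmulqdq against L_aes_gcm_avx2_mod2_128) continues below,
+        # interleaved with the remaining AES rounds of its counter block.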
+ vaesenc 80(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1 + vaesenc 96(%ebp), %xmm7, %xmm7 + vaesenc 112(%ebp), %xmm7, %xmm7 + vaesenc 128(%ebp), %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vaesenc 144(%ebp), %xmm7, %xmm7 + vpxor %xmm3, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vmovdqu 160(%ebp), %xmm0 + cmpl $11, 184(%esp) + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 176(%ebp), %xmm7, %xmm7 + vmovdqu 192(%ebp), %xmm0 + cmpl $13, 184(%esp) + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc %xmm0, %xmm7, %xmm7 + vaesenc 208(%ebp), %xmm7, %xmm7 + vmovdqu 224(%ebp), %xmm0 +L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last: + vaesenclast %xmm0, %xmm7, %xmm7 + vmovdqu (%esi,%ebx,1), %xmm3 + vpxor %xmm1, %xmm2, %xmm6 + vpxor %xmm3, %xmm7, %xmm7 + vmovdqu %xmm7, (%edi,%ebx,1) + vmovdqu 64(%esp), %xmm4 + addl $16, %ebx + cmpl %eax, %ebx + jl L_AES_GCM_decrypt_update_avx2_last_block_start +L_AES_GCM_decrypt_update_avx2_last_block_done: +L_AES_GCM_decrypt_update_avx2_done_dec: + movl 200(%esp), %esi + movl 208(%esp), %edi + vmovdqu 64(%esp), %xmm4 + vmovdqu %xmm6, (%esi) + vmovdqu %xmm4, (%edi) + addl $0xa0, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_update_avx2,.-AES_GCM_decrypt_update_avx2 +.text +.globl AES_GCM_decrypt_final_avx2 +.type AES_GCM_decrypt_final_avx2,@function +.align 16 +AES_GCM_decrypt_final_avx2: + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + subl $16, %esp + movl 36(%esp), %ebp + movl 56(%esp), %esi + movl 60(%esp), %edi + vmovdqu (%ebp), %xmm4 + vmovdqu (%esi), %xmm5 + vmovdqu (%edi), %xmm6 + vpsrlq $63, %xmm5, %xmm1 + vpsllq $0x01, %xmm5, %xmm0 + vpslldq $8, %xmm1, %xmm1 + vpor %xmm1, %xmm0, %xmm0 + vpshufd $0xff, %xmm5, %xmm5 + vpsrad $31, %xmm5, %xmm5 + vpand L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5 + vpxor %xmm0, %xmm5, %xmm5 + # calc_tag + movl 48(%esp), %ecx + shll $3, %ecx + vpinsrd $0x00, %ecx, %xmm0, %xmm0 + movl 52(%esp), %ecx + shll $3, %ecx + vpinsrd $2, %ecx, %xmm0, %xmm0 + movl 48(%esp), %ecx + shrl $29, %ecx + vpinsrd $0x01, %ecx, %xmm0, %xmm0 + movl 52(%esp), %ecx + shrl $29, %ecx + vpinsrd $3, %ecx, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + # ghash_gfmul_red + vpclmulqdq $16, %xmm5, %xmm0, %xmm7 + vpclmulqdq $0x01, %xmm5, %xmm0, %xmm3 + vpclmulqdq $0x00, %xmm5, %xmm0, %xmm2 + vpxor %xmm3, %xmm7, %xmm7 + vpslldq $8, %xmm7, %xmm3 + vpsrldq $8, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $0x11, %xmm5, %xmm0, %xmm0 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpclmulqdq $16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2 + vpshufd $0x4e, %xmm3, %xmm3 + vpxor %xmm7, %xmm0, %xmm0 + vpxor %xmm3, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpshufb L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0 + vpxor %xmm6, %xmm0, %xmm0 + movl 40(%esp), %esi + movl 64(%esp), %edi + # cmp_tag + cmpl $16, 44(%esp) + je L_AES_GCM_decrypt_final_avx2_cmp_tag_16 + xorl %ecx, %ecx + xorl %edx, %edx + vmovdqu %xmm0, (%esp) +L_AES_GCM_decrypt_final_avx2_cmp_tag_loop: + movzbl (%esp,%ecx,1), %eax + xorb (%esi,%ecx,1), %al + orb %al, %dl + incl %ecx + cmpl 44(%esp), %ecx + jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop + cmpb $0x00, %dl + sete %dl + jmp L_AES_GCM_decrypt_final_avx2_cmp_tag_done +L_AES_GCM_decrypt_final_avx2_cmp_tag_16: + vmovdqu (%esi), %xmm1 + vpcmpeqb %xmm1, %xmm0, %xmm0 + vpmovmskb %xmm0, %ecx + # %%edx == 0xFFFF 
then return 1 else => return 0 + xorl %edx, %edx + cmpl $0xffff, %ecx + sete %dl +L_AES_GCM_decrypt_final_avx2_cmp_tag_done: + movl %edx, (%edi) + addl $16, %esp + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +.size AES_GCM_decrypt_final_avx2,.-AES_GCM_decrypt_final_avx2 +#endif /* WOLFSSL_AESGCM_STREAM */ +#endif /* HAVE_INTEL_AVX2 */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S new file mode 100644 index 000000000..0a966b769 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S @@ -0,0 +1,2403 @@ +/* armv8-32-sha3-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha3/sha3.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2) +#ifndef WOLFSSL_ARMASM_INLINE +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON + .text + .type L_sha3_arm2_neon_rt, %object + .size L_sha3_arm2_neon_rt, 192 + .align 4 +L_sha3_arm2_neon_rt: + .word 0x1 + .word 0x0 + .word 0x8082 + .word 0x0 + .word 0x808a + .word 0x80000000 + .word 0x80008000 + .word 0x80000000 + .word 0x808b + .word 0x0 + .word 0x80000001 + .word 0x0 + .word 0x80008081 + .word 0x80000000 + .word 0x8009 + .word 0x80000000 + .word 0x8a + .word 0x0 + .word 0x88 + .word 0x0 + .word 0x80008009 + .word 0x0 + .word 0x8000000a + .word 0x0 + .word 0x8000808b + .word 0x0 + .word 0x8b + .word 0x80000000 + .word 0x8089 + .word 0x80000000 + .word 0x8003 + .word 0x80000000 + .word 0x8002 + .word 0x80000000 + .word 0x80 + .word 0x80000000 + .word 0x800a + .word 0x0 + .word 0x8000000a + .word 0x80000000 + .word 0x80008081 + .word 0x80000000 + .word 0x8080 + .word 0x80000000 + .word 0x80000001 + .word 0x0 + .word 0x80008008 + .word 0x80000000 + .text + .align 4 + .globl BlockSha3 + .type BlockSha3, %function +BlockSha3: + vpush {d8-d15} + sub sp, sp, #16 + adr r1, L_sha3_arm2_neon_rt + mov r2, #24 + mov r3, sp + vld1.8 {d0-d3}, [r0]! + vld1.8 {d4-d7}, [r0]! + vld1.8 {d8-d11}, [r0]! + vld1.8 {d12-d15}, [r0]! + vld1.8 {d16-d19}, [r0]! + vld1.8 {d20-d23}, [r0]! 
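+        # d0-d24 hold the 25 64-bit Keccak lanes; s[24] is loaded below
+        # without writeback and r0 is rewound by 0xc0 (24 lanes * 8 bytes)
+        # so the state can be written back to the same address after the
+        # 24 rounds.  Each round XORs one constant from L_sha3_arm2_neon_rt
+        # into lane 0 (the iota step).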
+ vld1.8 {d24}, [r0] + sub r0, r0, #0xc0 +L_sha3_arm32_neon_begin: + # Calc b[0..4] + veor d26, d0, d5 + veor d27, d1, d6 + veor d28, d2, d7 + veor d29, d3, d8 + veor d25, d4, d9 + veor d26, d26, d10 + veor d27, d27, d11 + veor d28, d28, d12 + veor d29, d29, d13 + veor d25, d25, d14 + veor d26, d26, d15 + veor d27, d27, d16 + veor d28, d28, d17 + veor d29, d29, d18 + veor d25, d25, d19 + veor d26, d26, d20 + veor d27, d27, d21 + veor d28, d28, d22 + veor d29, d29, d23 + veor d25, d25, d24 + vst1.8 {d25, d26}, [r3] + # Calc t[0..4] and XOR into s[i*5..i*5+4] + # t[0] + vshr.u64 d30, d27, #63 + vshl.u64 d31, d27, #1 + veor d25, d25, d30 + veor d25, d25, d31 + # t[1] + vshr.u64 d30, d28, #63 + vshl.u64 d31, d28, #1 + veor d26, d26, d30 + veor d26, d26, d31 + # t[2] + vshr.u64 d30, d29, #63 + vshl.u64 d31, d29, #1 + veor d27, d27, d30 + veor d27, d27, d31 + # t[3] + vldr.8 d31, [r3] + vshr.u64 d30, d31, #63 + vshl.u64 d31, d31, #1 + veor d28, d28, d30 + veor d28, d28, d31 + # t[4] + vldr.8 d31, [r3, #8] + vshr.u64 d30, d31, #63 + vshl.u64 d31, d31, #1 + veor d29, d29, d30 + veor d29, d29, d31 + sub r3, r3, #16 + veor d0, d0, d25 + # s[1] => s[10] (tmp) + veor d30, d1, d26 + vshr.u64 d31, d30, #63 + vshl.u64 d30, d30, #1 + veor d30, d30, d31 + # s[6] => s[1] + veor d1, d6, d26 + vshr.u64 d31, d1, #20 + vshl.u64 d1, d1, #44 + veor d1, d1, d31 + # s[9] => s[6] + veor d6, d9, d29 + vshr.u64 d31, d6, #44 + vshl.u64 d6, d6, #20 + veor d6, d6, d31 + # s[22] => s[9] + veor d9, d22, d27 + vshr.u64 d31, d9, #3 + vshl.u64 d9, d9, #61 + veor d9, d9, d31 + # s[14] => s[22] + veor d22, d14, d29 + vshr.u64 d31, d22, #25 + vshl.u64 d22, d22, #39 + veor d22, d22, d31 + # s[20] => s[14] + veor d14, d20, d25 + vshr.u64 d31, d14, #46 + vshl.u64 d14, d14, #18 + veor d14, d14, d31 + # s[2] => s[20] + veor d20, d2, d27 + vshr.u64 d31, d20, #2 + vshl.u64 d20, d20, #62 + veor d20, d20, d31 + # s[12] => s[2] + veor d2, d12, d27 + vshr.u64 d31, d2, #21 + vshl.u64 d2, d2, #43 + veor d2, d2, d31 + # s[13] => s[12] + veor d12, d13, d28 + vshr.u64 d31, d12, #39 + vshl.u64 d12, d12, #25 + veor d12, d12, d31 + # s[19] => s[13] + veor d13, d19, d29 + vshr.u64 d31, d13, #56 + vshl.u64 d13, d13, #8 + veor d13, d13, d31 + # s[23] => s[19] + veor d19, d23, d28 + vshr.u64 d31, d19, #8 + vshl.u64 d19, d19, #56 + veor d19, d19, d31 + # s[15] => s[23] + veor d23, d15, d25 + vshr.u64 d31, d23, #23 + vshl.u64 d23, d23, #41 + veor d23, d23, d31 + # s[4] => s[15] + veor d15, d4, d29 + vshr.u64 d31, d15, #37 + vshl.u64 d15, d15, #27 + veor d15, d15, d31 + # s[24] => s[4] + veor d4, d24, d29 + vshr.u64 d31, d4, #50 + vshl.u64 d4, d4, #14 + veor d4, d4, d31 + # s[21] => s[24] + veor d24, d21, d26 + vshr.u64 d31, d24, #62 + vshl.u64 d24, d24, #2 + veor d24, d24, d31 + # s[8] => s[21] + veor d21, d8, d28 + vshr.u64 d31, d21, #9 + vshl.u64 d21, d21, #55 + veor d21, d21, d31 + # s[16] => s[8] + veor d8, d16, d26 + vshr.u64 d31, d8, #19 + vshl.u64 d8, d8, #45 + veor d8, d8, d31 + # s[5] => s[16] + veor d16, d5, d25 + vshr.u64 d31, d16, #28 + vshl.u64 d16, d16, #36 + veor d16, d16, d31 + # s[3] => s[5] + veor d5, d3, d28 + vshr.u64 d31, d5, #36 + vshl.u64 d5, d5, #28 + veor d5, d5, d31 + # s[18] => s[3] + veor d3, d18, d28 + vshr.u64 d31, d3, #43 + vshl.u64 d3, d3, #21 + veor d3, d3, d31 + # s[17] => s[18] + veor d18, d17, d27 + vshr.u64 d31, d18, #49 + vshl.u64 d18, d18, #15 + veor d18, d18, d31 + # s[11] => s[17] + veor d17, d11, d26 + vshr.u64 d31, d17, #54 + vshl.u64 d17, d17, #10 + veor d17, d17, d31 + # s[7] => s[11] + veor d11, d7, d27 + 
vshr.u64 d31, d11, #58 + vshl.u64 d11, d11, #6 + veor d11, d11, d31 + # s[10] => s[7] + veor d7, d10, d25 + vshr.u64 d31, d7, #61 + vshl.u64 d7, d7, #3 + veor d7, d7, d31 + # Row Mix + vmov d25, d0 + vmov d26, d1 + vbic d31, d2, d26 + veor d0, d25, d31 + vbic d31, d3, d2 + veor d1, d26, d31 + vbic d31, d4, d3 + veor d2, d2, d31 + vbic d31, d25, d4 + veor d3, d3, d31 + vbic d31, d26, d25 + veor d4, d4, d31 + vmov d25, d5 + vmov d26, d6 + vbic d31, d7, d26 + veor d5, d25, d31 + vbic d31, d8, d7 + veor d6, d26, d31 + vbic d31, d9, d8 + veor d7, d7, d31 + vbic d31, d25, d9 + veor d8, d8, d31 + vbic d31, d26, d25 + veor d9, d9, d31 + vmov d26, d11 + vbic d31, d12, d26 + veor d10, d30, d31 + vbic d31, d13, d12 + veor d11, d26, d31 + vbic d31, d14, d13 + veor d12, d12, d31 + vbic d31, d30, d14 + veor d13, d13, d31 + vbic d31, d26, d30 + veor d14, d14, d31 + vmov d25, d15 + vmov d26, d16 + vbic d31, d17, d26 + veor d15, d25, d31 + vbic d31, d18, d17 + veor d16, d26, d31 + vbic d31, d19, d18 + veor d17, d17, d31 + vbic d31, d25, d19 + veor d18, d18, d31 + vbic d31, d26, d25 + veor d19, d19, d31 + vmov d25, d20 + vmov d26, d21 + vbic d31, d22, d26 + veor d20, d25, d31 + vbic d31, d23, d22 + veor d21, d26, d31 + vbic d31, d24, d23 + veor d22, d22, d31 + vbic d31, d25, d24 + veor d23, d23, d31 + vbic d31, d26, d25 + veor d24, d24, d31 + vld1.8 {d30}, [r1]! + subs r2, r2, #1 + veor d0, d0, d30 + bne L_sha3_arm32_neon_begin + vst1.8 {d0-d3}, [r0]! + vst1.8 {d4-d7}, [r0]! + vst1.8 {d8-d11}, [r0]! + vst1.8 {d12-d15}, [r0]! + vst1.8 {d16-d19}, [r0]! + vst1.8 {d20-d23}, [r0]! + vst1.8 {d24}, [r0] + add sp, sp, #16 + vpop {d8-d15} + bx lr + .size BlockSha3,.-BlockSha3 +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_sha3_arm2_rt, %object + .size L_sha3_arm2_rt, 192 + .align 4 +L_sha3_arm2_rt: + .word 0x1 + .word 0x0 + .word 0x8082 + .word 0x0 + .word 0x808a + .word 0x80000000 + .word 0x80008000 + .word 0x80000000 + .word 0x808b + .word 0x0 + .word 0x80000001 + .word 0x0 + .word 0x80008081 + .word 0x80000000 + .word 0x8009 + .word 0x80000000 + .word 0x8a + .word 0x0 + .word 0x88 + .word 0x0 + .word 0x80008009 + .word 0x0 + .word 0x8000000a + .word 0x0 + .word 0x8000808b + .word 0x0 + .word 0x8b + .word 0x80000000 + .word 0x8089 + .word 0x80000000 + .word 0x8003 + .word 0x80000000 + .word 0x8002 + .word 0x80000000 + .word 0x80 + .word 0x80000000 + .word 0x800a + .word 0x0 + .word 0x8000000a + .word 0x80000000 + .word 0x80008081 + .word 0x80000000 + .word 0x8080 + .word 0x80000000 + .word 0x80000001 + .word 0x0 + .word 0x80008008 + .word 0x80000000 + .text + .align 4 + .globl BlockSha3 + .type BlockSha3, %function +BlockSha3: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0xcc + adr r1, L_sha3_arm2_rt + mov r2, #12 +L_sha3_arm32_begin: + str r2, [sp, #200] + # Round even + # Calc b[4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #32] + ldr r5, [r0, #36] +#else + ldrd r4, r5, [r0, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #72] + ldr r7, [r0, #76] +#else + ldrd r6, r7, [r0, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #112] + ldr r9, [r0, #116] +#else + ldrd r8, r9, [r0, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #152] + ldr r11, [r0, #156] +#else + ldrd r10, r11, [r0, #152] +#endif + ldr r12, [r0, #192] + ldr lr, [r0, #196] + eor r2, r4, r6 + eor r3, r5, r7 + eor r2, r2, r8 + eor r3, r3, r9 + eor r2, r2, r10 + 
eor r3, r3, r11 + eor r2, r2, r12 + eor r3, r3, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r2, [sp, #32] + str r3, [sp, #36] +#else + strd r2, r3, [sp, #32] +#endif + # Calc b[1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #8] + ldr r5, [r0, #12] +#else + ldrd r4, r5, [r0, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #48] + ldr r7, [r0, #52] +#else + ldrd r6, r7, [r0, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #88] + ldr r9, [r0, #92] +#else + ldrd r8, r9, [r0, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #128] + ldr r11, [r0, #132] +#else + ldrd r10, r11, [r0, #128] +#endif + ldr r12, [r0, #168] + ldr lr, [r0, #172] + eor r4, r4, r6 + eor r5, r5, r7 + eor r4, r4, r8 + eor r5, r5, r9 + eor r4, r4, r10 + eor r5, r5, r11 + eor r4, r4, r12 + eor r5, r5, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #8] + str r5, [sp, #12] +#else + strd r4, r5, [sp, #8] +#endif + # Calc t[0] + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # Calc b[0] and XOR t[0] into s[x*5+0] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] +#else + ldrd r4, r5, [r0] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #40] + ldr r7, [r0, #44] +#else + ldrd r6, r7, [r0, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #80] + ldr r9, [r0, #84] +#else + ldrd r8, r9, [r0, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #120] + ldr r11, [r0, #124] +#else + ldrd r10, r11, [r0, #120] +#endif + eor r12, r4, r6 + eor lr, r5, r7 + eor r12, r12, r8 + eor lr, lr, r9 + eor r12, r12, r10 + eor lr, lr, r11 + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0] + str r5, [r0, #4] +#else + strd r4, r5, [r0] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [r0, #40] + str r7, [r0, #44] +#else + strd r6, r7, [r0, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [r0, #80] + str r9, [r0, #84] +#else + strd r8, r9, [r0, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #120] + str r11, [r0, #124] +#else + strd r10, r11, [r0, #120] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #160] + ldr r11, [r0, #164] +#else + ldrd r10, r11, [r0, #160] +#endif + eor r12, r12, r10 + eor lr, lr, r11 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #160] + str r11, [r0, #164] +#else + strd r10, r11, [r0, #160] +#endif + str r12, [sp] + str lr, [sp, #4] + # Calc b[3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #24] + ldr r5, [r0, #28] +#else + ldrd r4, r5, [r0, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #64] + ldr r7, [r0, #68] +#else + ldrd r6, r7, [r0, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #104] + ldr r9, [r0, #108] +#else + ldrd r8, r9, [r0, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #144] + ldr r11, [r0, #148] +#else + ldrd r10, r11, [r0, #144] +#endif + ldr r12, [r0, 
#184] + ldr lr, [r0, #188] + eor r4, r4, r6 + eor r5, r5, r7 + eor r4, r4, r8 + eor r5, r5, r9 + eor r4, r4, r10 + eor r5, r5, r11 + eor r4, r4, r12 + eor r5, r5, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #24] + str r5, [sp, #28] +#else + strd r4, r5, [sp, #24] +#endif + # Calc t[2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #8] + ldr r3, [sp, #12] +#else + ldrd r2, r3, [sp, #8] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # Calc b[2] and XOR t[2] into s[x*5+2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #16] + ldr r5, [r0, #20] +#else + ldrd r4, r5, [r0, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #56] + ldr r7, [r0, #60] +#else + ldrd r6, r7, [r0, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #96] + ldr r9, [r0, #100] +#else + ldrd r8, r9, [r0, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #136] + ldr r11, [r0, #140] +#else + ldrd r10, r11, [r0, #136] +#endif + eor r12, r4, r6 + eor lr, r5, r7 + eor r12, r12, r8 + eor lr, lr, r9 + eor r12, r12, r10 + eor lr, lr, r11 + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #16] + str r5, [r0, #20] +#else + strd r4, r5, [r0, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [r0, #56] + str r7, [r0, #60] +#else + strd r6, r7, [r0, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [r0, #96] + str r9, [r0, #100] +#else + strd r8, r9, [r0, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #136] + str r11, [r0, #140] +#else + strd r10, r11, [r0, #136] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #176] + ldr r11, [r0, #180] +#else + ldrd r10, r11, [r0, #176] +#endif + eor r12, r12, r10 + eor lr, lr, r11 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #176] + str r11, [r0, #180] +#else + strd r10, r11, [r0, #176] +#endif + str r12, [sp, #16] + str lr, [sp, #20] + # Calc t[1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp] + ldr r3, [sp, #4] +#else + ldrd r2, r3, [sp] +#endif + eor r2, r2, lr, lsr #31 + eor r3, r3, r12, lsr #31 + eor r2, r2, r12, lsl #1 + eor r3, r3, lr, lsl #1 + # XOR t[1] into s[x*5+1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #8] + ldr r5, [r0, #12] +#else + ldrd r4, r5, [r0, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #48] + ldr r7, [r0, #52] +#else + ldrd r6, r7, [r0, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #88] + ldr r9, [r0, #92] +#else + ldrd r8, r9, [r0, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #128] + ldr r11, [r0, #132] +#else + ldrd r10, r11, [r0, #128] +#endif + ldr r12, [r0, #168] + ldr lr, [r0, #172] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #8] + str r5, [r0, #12] +#else + strd r4, r5, [r0, #8] +#endif +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [r0, #48] + str r7, [r0, #52] +#else + strd r6, r7, [r0, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [r0, #88] + str r9, [r0, #92] +#else + strd r8, r9, [r0, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #128] + str r11, [r0, #132] +#else + strd r10, r11, [r0, #128] +#endif + str r12, [r0, #168] + str lr, [r0, #172] + # Calc t[3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #16] + ldr r3, [sp, #20] +#else + ldrd r2, r3, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #32] + ldr r5, [sp, #36] +#else + ldrd r4, r5, [sp, #32] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # XOR t[3] into s[x*5+3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #24] + ldr r5, [r0, #28] +#else + ldrd r4, r5, [r0, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #64] + ldr r7, [r0, #68] +#else + ldrd r6, r7, [r0, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #104] + ldr r9, [r0, #108] +#else + ldrd r8, r9, [r0, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #144] + ldr r11, [r0, #148] +#else + ldrd r10, r11, [r0, #144] +#endif + ldr r12, [r0, #184] + ldr lr, [r0, #188] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #24] + str r5, [r0, #28] +#else + strd r4, r5, [r0, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [r0, #64] + str r7, [r0, #68] +#else + strd r6, r7, [r0, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [r0, #104] + str r9, [r0, #108] +#else + strd r8, r9, [r0, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #144] + str r11, [r0, #148] +#else + strd r10, r11, [r0, #144] +#endif + str r12, [r0, #184] + str lr, [r0, #188] + # Calc t[4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #24] + ldr r3, [sp, #28] +#else + ldrd r2, r3, [sp, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp] + ldr r5, [sp, #4] +#else + ldrd r4, r5, [sp] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # XOR t[4] into s[x*5+4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #32] + ldr r5, [r0, #36] +#else + ldrd r4, r5, [r0, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #72] + ldr r7, [r0, #76] +#else + ldrd r6, r7, [r0, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #112] + ldr r9, [r0, #116] +#else + ldrd r8, r9, [r0, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #152] + ldr r11, [r0, #156] +#else + ldrd r10, r11, [r0, #152] +#endif + ldr r12, [r0, #192] + ldr lr, [r0, #196] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #32] + str r5, [r0, #36] +#else + strd r4, r5, 
[r0, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [r0, #72] + str r7, [r0, #76] +#else + strd r6, r7, [r0, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [r0, #112] + str r9, [r0, #116] +#else + strd r8, r9, [r0, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [r0, #152] + str r11, [r0, #156] +#else + strd r10, r11, [r0, #152] +#endif + str r12, [r0, #192] + str lr, [r0, #196] + # Row Mix + # Row 0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0] + ldr r3, [r0, #4] +#else + ldrd r2, r3, [r0] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #48] + ldr r5, [r0, #52] +#else + ldrd r4, r5, [r0, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #96] + ldr r7, [r0, #100] +#else + ldrd r6, r7, [r0, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #144] + ldr r9, [r0, #148] +#else + ldrd r8, r9, [r0, #144] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #192] + ldr r11, [r0, #196] +#else + ldrd r10, r11, [r0, #192] +#endif + # s[1] <<< 44 + mov lr, r4 + lsr r12, r5, #20 + lsr r4, r4, #20 + orr r4, r4, r5, lsl #12 + orr r5, r12, lr, lsl #12 + # s[2] <<< 43 + mov lr, r6 + lsr r12, r7, #21 + lsr r6, r6, #21 + orr r6, r6, r7, lsl #11 + orr r7, r12, lr, lsl #11 + # s[3] <<< 21 + lsr r12, r9, #11 + lsr lr, r8, #11 + orr r8, r12, r8, lsl #21 + orr r9, lr, r9, lsl #21 + # s[4] <<< 14 + lsr r12, r11, #18 + lsr lr, r10, #18 + orr r10, r12, r10, lsl #14 + orr r11, lr, r11, lsl #14 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [sp, #8] + str lr, [sp, #12] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [sp, #16] + str lr, [sp, #20] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [sp, #24] + str lr, [sp, #28] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp, #32] + str lr, [sp, #36] + # Get constant +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r1] + ldr r11, [r1, #4] +#else + ldrd r10, r11, [r1] +#endif + add r1, r1, #8 + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + # XOR in constant + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp] + str lr, [sp, #4] + # Row 1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #24] + ldr r3, [r0, #28] +#else + ldrd r2, r3, [r0, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #72] + ldr r5, [r0, #76] +#else + ldrd r4, r5, [r0, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #80] + ldr r7, [r0, #84] +#else + ldrd r6, r7, [r0, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #128] + ldr r9, [r0, #132] +#else + ldrd r8, r9, [r0, #128] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #176] + ldr r11, [r0, #180] +#else + ldrd r10, r11, [r0, #176] +#endif + # s[0] <<< 28 + lsr r12, r3, #4 + lsr lr, r2, #4 + orr r2, r12, r2, lsl #28 + orr r3, lr, r3, lsl #28 + # s[1] <<< 20 + lsr r12, r5, #12 + lsr lr, r4, #12 + orr r4, r12, r4, lsl #20 + orr r5, lr, r5, lsl #20 + # s[2] <<< 3 + lsr r12, r7, #29 + lsr lr, r6, #29 + orr r6, r12, r6, lsl #3 + orr r7, lr, r7, lsl #3 + # s[3] <<< 45 + mov lr, r8 + lsr r12, r9, #19 + lsr r8, r8, #19 + orr r8, r8, r9, lsl #13 + orr r9, r12, lr, 
lsl #13 + # s[4] <<< 61 + mov lr, r10 + lsr r12, r11, #3 + lsr r10, r10, #3 + orr r10, r10, r11, lsl #29 + orr r11, r12, lr, lsl #29 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [sp, #48] + str lr, [sp, #52] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [sp, #56] + str lr, [sp, #60] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [sp, #64] + str lr, [sp, #68] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp, #72] + str lr, [sp, #76] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [sp, #40] + str lr, [sp, #44] + # Row 2 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #8] + ldr r3, [r0, #12] +#else + ldrd r2, r3, [r0, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #56] + ldr r5, [r0, #60] +#else + ldrd r4, r5, [r0, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #104] + ldr r7, [r0, #108] +#else + ldrd r6, r7, [r0, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #152] + ldr r9, [r0, #156] +#else + ldrd r8, r9, [r0, #152] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #160] + ldr r11, [r0, #164] +#else + ldrd r10, r11, [r0, #160] +#endif + # s[0] <<< 1 + lsr r12, r3, #31 + lsr lr, r2, #31 + orr r2, r12, r2, lsl #1 + orr r3, lr, r3, lsl #1 + # s[1] <<< 6 + lsr r12, r5, #26 + lsr lr, r4, #26 + orr r4, r12, r4, lsl #6 + orr r5, lr, r5, lsl #6 + # s[2] <<< 25 + lsr r12, r7, #7 + lsr lr, r6, #7 + orr r6, r12, r6, lsl #25 + orr r7, lr, r7, lsl #25 + # s[3] <<< 8 + lsr r12, r9, #24 + lsr lr, r8, #24 + orr r8, r12, r8, lsl #8 + orr r9, lr, r9, lsl #8 + # s[4] <<< 18 + lsr r12, r11, #14 + lsr lr, r10, #14 + orr r10, r12, r10, lsl #18 + orr r11, lr, r11, lsl #18 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [sp, #88] + str lr, [sp, #92] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [sp, #96] + str lr, [sp, #100] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [sp, #104] + str lr, [sp, #108] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp, #112] + str lr, [sp, #116] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [sp, #80] + str lr, [sp, #84] + # Row 3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #32] + ldr r3, [r0, #36] +#else + ldrd r2, r3, [r0, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #40] + ldr r5, [r0, #44] +#else + ldrd r4, r5, [r0, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #88] + ldr r7, [r0, #92] +#else + ldrd r6, r7, [r0, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #136] + ldr r9, [r0, #140] +#else + ldrd r8, r9, [r0, #136] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #184] + ldr r11, [r0, #188] +#else + ldrd r10, r11, [r0, #184] +#endif + # s[0] <<< 27 + lsr r12, r3, #5 + lsr lr, r2, #5 + orr r2, r12, r2, lsl #27 + orr r3, lr, r3, lsl #27 + # s[1] <<< 36 + mov lr, r4 + lsr r12, r5, #28 + lsr r4, r4, #28 + orr r4, r4, r5, lsl #4 + orr r5, r12, lr, lsl #4 + # s[2] <<< 10 + lsr r12, r7, #22 + lsr lr, r6, #22 + orr r6, r12, r6, lsl #10 + orr r7, lr, r7, lsl #10 + # s[3] <<< 15 + lsr 
r12, r9, #17 + lsr lr, r8, #17 + orr r8, r12, r8, lsl #15 + orr r9, lr, r9, lsl #15 + # s[4] <<< 56 + mov lr, r10 + lsr r12, r11, #8 + lsr r10, r10, #8 + orr r10, r10, r11, lsl #24 + orr r11, r12, lr, lsl #24 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [sp, #128] + str lr, [sp, #132] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [sp, #136] + str lr, [sp, #140] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [sp, #144] + str lr, [sp, #148] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp, #152] + str lr, [sp, #156] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [sp, #120] + str lr, [sp, #124] + # Row 4 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #16] + ldr r3, [r0, #20] +#else + ldrd r2, r3, [r0, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #64] + ldr r5, [r0, #68] +#else + ldrd r4, r5, [r0, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [r0, #112] + ldr r7, [r0, #116] +#else + ldrd r6, r7, [r0, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [r0, #120] + ldr r9, [r0, #124] +#else + ldrd r8, r9, [r0, #120] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r0, #168] + ldr r11, [r0, #172] +#else + ldrd r10, r11, [r0, #168] +#endif + # s[0] <<< 62 + mov lr, r2 + lsr r12, r3, #2 + lsr r2, r2, #2 + orr r2, r2, r3, lsl #30 + orr r3, r12, lr, lsl #30 + # s[1] <<< 55 + mov lr, r4 + lsr r12, r5, #9 + lsr r4, r4, #9 + orr r4, r4, r5, lsl #23 + orr r5, r12, lr, lsl #23 + # s[2] <<< 39 + mov lr, r6 + lsr r12, r7, #25 + lsr r6, r6, #25 + orr r6, r6, r7, lsl #7 + orr r7, r12, lr, lsl #7 + # s[3] <<< 41 + mov lr, r8 + lsr r12, r9, #23 + lsr r8, r8, #23 + orr r8, r8, r9, lsl #9 + orr r9, r12, lr, lsl #9 + # s[4] <<< 2 + lsr r12, r11, #30 + lsr lr, r10, #30 + orr r10, r12, r10, lsl #2 + orr r11, lr, r11, lsl #2 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [sp, #168] + str lr, [sp, #172] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [sp, #176] + str lr, [sp, #180] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [sp, #184] + str lr, [sp, #188] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [sp, #192] + str lr, [sp, #196] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [sp, #160] + str lr, [sp, #164] + # Round odd + # Calc b[4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #32] + ldr r5, [sp, #36] +#else + ldrd r4, r5, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #72] + ldr r7, [sp, #76] +#else + ldrd r6, r7, [sp, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #112] + ldr r9, [sp, #116] +#else + ldrd r8, r9, [sp, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #152] + ldr r11, [sp, #156] +#else + ldrd r10, r11, [sp, #152] +#endif + ldr r12, [sp, #192] + ldr lr, [sp, #196] + eor r2, r4, r6 + eor r3, r5, r7 + eor r2, r2, r8 + eor r3, r3, r9 + eor r2, r2, r10 + eor r3, r3, r11 + eor r2, r2, r12 + eor r3, r3, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r2, [r0, #32] + str r3, [r0, #36] +#else + strd r2, r3, [r0, #32] +#endif + # 
Calc b[1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #8] + ldr r5, [sp, #12] +#else + ldrd r4, r5, [sp, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #48] + ldr r7, [sp, #52] +#else + ldrd r6, r7, [sp, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #88] + ldr r9, [sp, #92] +#else + ldrd r8, r9, [sp, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #128] + ldr r11, [sp, #132] +#else + ldrd r10, r11, [sp, #128] +#endif + ldr r12, [sp, #168] + ldr lr, [sp, #172] + eor r4, r4, r6 + eor r5, r5, r7 + eor r4, r4, r8 + eor r5, r5, r9 + eor r4, r4, r10 + eor r5, r5, r11 + eor r4, r4, r12 + eor r5, r5, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #8] + str r5, [r0, #12] +#else + strd r4, r5, [r0, #8] +#endif + # Calc t[0] + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # Calc b[0] and XOR t[0] into s[x*5+0] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp] + ldr r5, [sp, #4] +#else + ldrd r4, r5, [sp] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #40] + ldr r7, [sp, #44] +#else + ldrd r6, r7, [sp, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #80] + ldr r9, [sp, #84] +#else + ldrd r8, r9, [sp, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #120] + ldr r11, [sp, #124] +#else + ldrd r10, r11, [sp, #120] +#endif + eor r12, r4, r6 + eor lr, r5, r7 + eor r12, r12, r8 + eor lr, lr, r9 + eor r12, r12, r10 + eor lr, lr, r11 + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp] + str r5, [sp, #4] +#else + strd r4, r5, [sp] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #40] + str r7, [sp, #44] +#else + strd r6, r7, [sp, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [sp, #80] + str r9, [sp, #84] +#else + strd r8, r9, [sp, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #120] + str r11, [sp, #124] +#else + strd r10, r11, [sp, #120] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #160] + ldr r11, [sp, #164] +#else + ldrd r10, r11, [sp, #160] +#endif + eor r12, r12, r10 + eor lr, lr, r11 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #160] + str r11, [sp, #164] +#else + strd r10, r11, [sp, #160] +#endif + str r12, [r0] + str lr, [r0, #4] + # Calc b[3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #24] + ldr r5, [sp, #28] +#else + ldrd r4, r5, [sp, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #64] + ldr r7, [sp, #68] +#else + ldrd r6, r7, [sp, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #104] + ldr r9, [sp, #108] +#else + ldrd r8, r9, [sp, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #144] + ldr r11, [sp, #148] +#else + ldrd r10, r11, [sp, #144] +#endif + ldr r12, [sp, #184] + ldr lr, [sp, #188] + eor r4, r4, r6 + eor r5, r5, r7 + eor r4, r4, r8 + eor r5, r5, r9 + eor r4, r4, r10 + eor r5, r5, r11 + eor r4, r4, r12 + eor r5, r5, lr +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [r0, #24] + str r5, [r0, #28] +#else + strd r4, r5, [r0, #24] +#endif + # Calc t[2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #8] + ldr r3, [r0, #12] +#else + ldrd r2, r3, [r0, #8] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # Calc b[2] and XOR t[2] into s[x*5+2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #16] + ldr r5, [sp, #20] +#else + ldrd r4, r5, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #56] + ldr r7, [sp, #60] +#else + ldrd r6, r7, [sp, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #96] + ldr r9, [sp, #100] +#else + ldrd r8, r9, [sp, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #136] + ldr r11, [sp, #140] +#else + ldrd r10, r11, [sp, #136] +#endif + eor r12, r4, r6 + eor lr, r5, r7 + eor r12, r12, r8 + eor lr, lr, r9 + eor r12, r12, r10 + eor lr, lr, r11 + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #16] + str r5, [sp, #20] +#else + strd r4, r5, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #56] + str r7, [sp, #60] +#else + strd r6, r7, [sp, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [sp, #96] + str r9, [sp, #100] +#else + strd r8, r9, [sp, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #136] + str r11, [sp, #140] +#else + strd r10, r11, [sp, #136] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #176] + ldr r11, [sp, #180] +#else + ldrd r10, r11, [sp, #176] +#endif + eor r12, r12, r10 + eor lr, lr, r11 + eor r10, r10, r2 + eor r11, r11, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #176] + str r11, [sp, #180] +#else + strd r10, r11, [sp, #176] +#endif + str r12, [r0, #16] + str lr, [r0, #20] + # Calc t[1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0] + ldr r3, [r0, #4] +#else + ldrd r2, r3, [r0] +#endif + eor r2, r2, lr, lsr #31 + eor r3, r3, r12, lsr #31 + eor r2, r2, r12, lsl #1 + eor r3, r3, lr, lsl #1 + # XOR t[1] into s[x*5+1] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #8] + ldr r5, [sp, #12] +#else + ldrd r4, r5, [sp, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #48] + ldr r7, [sp, #52] +#else + ldrd r6, r7, [sp, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #88] + ldr r9, [sp, #92] +#else + ldrd r8, r9, [sp, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #128] + ldr r11, [sp, #132] +#else + ldrd r10, r11, [sp, #128] +#endif + ldr r12, [sp, #168] + ldr lr, [sp, #172] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #8] + str r5, [sp, #12] +#else + strd r4, r5, [sp, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #48] + str r7, [sp, #52] +#else + strd r6, r7, [sp, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) + str r8, [sp, #88] + str r9, [sp, #92] +#else + strd r8, r9, [sp, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #128] + str r11, [sp, #132] +#else + strd r10, r11, [sp, #128] +#endif + str r12, [sp, #168] + str lr, [sp, #172] + # Calc t[3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #16] + ldr r3, [r0, #20] +#else + ldrd r2, r3, [r0, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0, #32] + ldr r5, [r0, #36] +#else + ldrd r4, r5, [r0, #32] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # XOR t[3] into s[x*5+3] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #24] + ldr r5, [sp, #28] +#else + ldrd r4, r5, [sp, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #64] + ldr r7, [sp, #68] +#else + ldrd r6, r7, [sp, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #104] + ldr r9, [sp, #108] +#else + ldrd r8, r9, [sp, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #144] + ldr r11, [sp, #148] +#else + ldrd r10, r11, [sp, #144] +#endif + ldr r12, [sp, #184] + ldr lr, [sp, #188] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #24] + str r5, [sp, #28] +#else + strd r4, r5, [sp, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #64] + str r7, [sp, #68] +#else + strd r6, r7, [sp, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [sp, #104] + str r9, [sp, #108] +#else + strd r8, r9, [sp, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #144] + str r11, [sp, #148] +#else + strd r10, r11, [sp, #144] +#endif + str r12, [sp, #184] + str lr, [sp, #188] + # Calc t[4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [r0, #24] + ldr r3, [r0, #28] +#else + ldrd r2, r3, [r0, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] +#else + ldrd r4, r5, [r0] +#endif + eor r2, r2, r5, lsr #31 + eor r3, r3, r4, lsr #31 + eor r2, r2, r4, lsl #1 + eor r3, r3, r5, lsl #1 + # XOR t[4] into s[x*5+4] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #32] + ldr r5, [sp, #36] +#else + ldrd r4, r5, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #72] + ldr r7, [sp, #76] +#else + ldrd r6, r7, [sp, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #112] + ldr r9, [sp, #116] +#else + ldrd r8, r9, [sp, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #152] + ldr r11, [sp, #156] +#else + ldrd r10, r11, [sp, #152] +#endif + ldr r12, [sp, #192] + ldr lr, [sp, #196] + eor r4, r4, r2 + eor r5, r5, r3 + eor r6, r6, r2 + eor r7, r7, r3 + eor r8, r8, r2 + eor r9, r9, r3 + eor r10, r10, r2 + eor r11, r11, r3 + eor r12, r12, r2 + eor lr, lr, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #32] + str r5, [sp, #36] +#else + strd r4, r5, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #72] + str r7, [sp, #76] +#else + strd r6, r7, [sp, #72] +#endif +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r8, [sp, #112] + str r9, [sp, #116] +#else + strd r8, r9, [sp, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r10, [sp, #152] + str r11, [sp, #156] +#else + strd r10, r11, [sp, #152] +#endif + str r12, [sp, #192] + str lr, [sp, #196] + # Row Mix + # Row 0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp] + ldr r3, [sp, #4] +#else + ldrd r2, r3, [sp] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #48] + ldr r5, [sp, #52] +#else + ldrd r4, r5, [sp, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #96] + ldr r7, [sp, #100] +#else + ldrd r6, r7, [sp, #96] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #144] + ldr r9, [sp, #148] +#else + ldrd r8, r9, [sp, #144] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #192] + ldr r11, [sp, #196] +#else + ldrd r10, r11, [sp, #192] +#endif + # s[1] <<< 44 + mov lr, r4 + lsr r12, r5, #20 + lsr r4, r4, #20 + orr r4, r4, r5, lsl #12 + orr r5, r12, lr, lsl #12 + # s[2] <<< 43 + mov lr, r6 + lsr r12, r7, #21 + lsr r6, r6, #21 + orr r6, r6, r7, lsl #11 + orr r7, r12, lr, lsl #11 + # s[3] <<< 21 + lsr r12, r9, #11 + lsr lr, r8, #11 + orr r8, r12, r8, lsl #21 + orr r9, lr, r9, lsl #21 + # s[4] <<< 14 + lsr r12, r11, #18 + lsr lr, r10, #18 + orr r10, r12, r10, lsl #14 + orr r11, lr, r11, lsl #14 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [r0, #16] + str lr, [r0, #20] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [r0, #24] + str lr, [r0, #28] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0, #32] + str lr, [r0, #36] + # Get constant +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [r1] + ldr r11, [r1, #4] +#else + ldrd r10, r11, [r1] +#endif + add r1, r1, #8 + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + # XOR in constant + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0] + str lr, [r0, #4] + # Row 1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #24] + ldr r3, [sp, #28] +#else + ldrd r2, r3, [sp, #24] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #72] + ldr r5, [sp, #76] +#else + ldrd r4, r5, [sp, #72] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #80] + ldr r7, [sp, #84] +#else + ldrd r6, r7, [sp, #80] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #128] + ldr r9, [sp, #132] +#else + ldrd r8, r9, [sp, #128] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #176] + ldr r11, [sp, #180] +#else + ldrd r10, r11, [sp, #176] +#endif + # s[0] <<< 28 + lsr r12, r3, #4 + lsr lr, r2, #4 + orr r2, r12, r2, lsl #28 + orr r3, lr, r3, lsl #28 + # s[1] <<< 20 + lsr r12, r5, #12 + lsr lr, r4, #12 + orr r4, r12, r4, lsl #20 + orr r5, lr, r5, lsl #20 + # s[2] <<< 3 + lsr r12, r7, #29 + lsr lr, r6, #29 + orr r6, r12, r6, lsl #3 + orr r7, lr, r7, lsl #3 + # s[3] <<< 45 + mov lr, r8 + lsr r12, r9, #19 + lsr r8, r8, #19 + orr r8, r8, r9, lsl #13 + orr r9, r12, lr, lsl #13 + # s[4] <<< 61 + mov lr, r10 + lsr r12, r11, #3 + lsr r10, r10, #3 + orr r10, r10, r11, lsl #29 + orr r11, r12, lr, lsl #29 + bic r12, r8, r6 + bic 
lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [r0, #56] + str lr, [r0, #60] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [r0, #64] + str lr, [r0, #68] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0, #72] + str lr, [r0, #76] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [r0, #40] + str lr, [r0, #44] + # Row 2 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #8] + ldr r3, [sp, #12] +#else + ldrd r2, r3, [sp, #8] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #56] + ldr r5, [sp, #60] +#else + ldrd r4, r5, [sp, #56] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #104] + ldr r7, [sp, #108] +#else + ldrd r6, r7, [sp, #104] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #152] + ldr r9, [sp, #156] +#else + ldrd r8, r9, [sp, #152] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #160] + ldr r11, [sp, #164] +#else + ldrd r10, r11, [sp, #160] +#endif + # s[0] <<< 1 + lsr r12, r3, #31 + lsr lr, r2, #31 + orr r2, r12, r2, lsl #1 + orr r3, lr, r3, lsl #1 + # s[1] <<< 6 + lsr r12, r5, #26 + lsr lr, r4, #26 + orr r4, r12, r4, lsl #6 + orr r5, lr, r5, lsl #6 + # s[2] <<< 25 + lsr r12, r7, #7 + lsr lr, r6, #7 + orr r6, r12, r6, lsl #25 + orr r7, lr, r7, lsl #25 + # s[3] <<< 8 + lsr r12, r9, #24 + lsr lr, r8, #24 + orr r8, r12, r8, lsl #8 + orr r9, lr, r9, lsl #8 + # s[4] <<< 18 + lsr r12, r11, #14 + lsr lr, r10, #14 + orr r10, r12, r10, lsl #18 + orr r11, lr, r11, lsl #18 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [r0, #88] + str lr, [r0, #92] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [r0, #96] + str lr, [r0, #100] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [r0, #104] + str lr, [r0, #108] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0, #112] + str lr, [r0, #116] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [r0, #80] + str lr, [r0, #84] + # Row 3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #32] + ldr r3, [sp, #36] +#else + ldrd r2, r3, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #40] + ldr r5, [sp, #44] +#else + ldrd r4, r5, [sp, #40] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #88] + ldr r7, [sp, #92] +#else + ldrd r6, r7, [sp, #88] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #136] + ldr r9, [sp, #140] +#else + ldrd r8, r9, [sp, #136] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #184] + ldr r11, [sp, #188] +#else + ldrd r10, r11, [sp, #184] +#endif + # s[0] <<< 27 + lsr r12, r3, #5 + lsr lr, r2, #5 + orr r2, r12, r2, lsl #27 + orr r3, lr, r3, lsl #27 + # s[1] <<< 36 + mov lr, r4 + lsr r12, r5, #28 + lsr r4, r4, #28 + orr r4, r4, r5, lsl #4 + orr r5, r12, lr, lsl #4 + # s[2] <<< 10 + lsr r12, r7, #22 + lsr lr, r6, #22 + orr r6, r12, r6, lsl #10 + orr r7, lr, r7, lsl #10 + # s[3] <<< 15 + lsr r12, r9, #17 + lsr lr, r8, #17 + orr r8, r12, r8, lsl #15 + orr r9, lr, r9, lsl #15 + # s[4] <<< 56 + mov lr, r10 + lsr r12, r11, #8 + lsr r10, r10, #8 + 
orr r10, r10, r11, lsl #24 + orr r11, r12, lr, lsl #24 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [r0, #128] + str lr, [r0, #132] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [r0, #136] + str lr, [r0, #140] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [r0, #144] + str lr, [r0, #148] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0, #152] + str lr, [r0, #156] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [r0, #120] + str lr, [r0, #124] + # Row 4 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r2, [sp, #16] + ldr r3, [sp, #20] +#else + ldrd r2, r3, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [sp, #64] + ldr r5, [sp, #68] +#else + ldrd r4, r5, [sp, #64] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r6, [sp, #112] + ldr r7, [sp, #116] +#else + ldrd r6, r7, [sp, #112] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r8, [sp, #120] + ldr r9, [sp, #124] +#else + ldrd r8, r9, [sp, #120] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r10, [sp, #168] + ldr r11, [sp, #172] +#else + ldrd r10, r11, [sp, #168] +#endif + # s[0] <<< 62 + mov lr, r2 + lsr r12, r3, #2 + lsr r2, r2, #2 + orr r2, r2, r3, lsl #30 + orr r3, r12, lr, lsl #30 + # s[1] <<< 55 + mov lr, r4 + lsr r12, r5, #9 + lsr r4, r4, #9 + orr r4, r4, r5, lsl #23 + orr r5, r12, lr, lsl #23 + # s[2] <<< 39 + mov lr, r6 + lsr r12, r7, #25 + lsr r6, r6, #25 + orr r6, r6, r7, lsl #7 + orr r7, r12, lr, lsl #7 + # s[3] <<< 41 + mov lr, r8 + lsr r12, r9, #23 + lsr r8, r8, #23 + orr r8, r8, r9, lsl #9 + orr r9, r12, lr, lsl #9 + # s[4] <<< 2 + lsr r12, r11, #30 + lsr lr, r10, #30 + orr r10, r12, r10, lsl #2 + orr r11, lr, r11, lsl #2 + bic r12, r8, r6 + bic lr, r9, r7 + eor r12, r12, r4 + eor lr, lr, r5 + str r12, [r0, #168] + str lr, [r0, #172] + bic r12, r10, r8 + bic lr, r11, r9 + eor r12, r12, r6 + eor lr, lr, r7 + str r12, [r0, #176] + str lr, [r0, #180] + bic r12, r2, r10 + bic lr, r3, r11 + eor r12, r12, r8 + eor lr, lr, r9 + str r12, [r0, #184] + str lr, [r0, #188] + bic r12, r4, r2 + bic lr, r5, r3 + eor r12, r12, r10 + eor lr, lr, r11 + str r12, [r0, #192] + str lr, [r0, #196] + bic r12, r6, r4 + bic lr, r7, r5 + eor r12, r12, r2 + eor lr, lr, r3 + str r12, [r0, #160] + str lr, [r0, #164] + ldr r2, [sp, #200] + subs r2, r2, #1 + bne L_sha3_arm32_begin + add sp, sp, #0xcc + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size BlockSha3,.-BlockSha3 +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ +#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c new file mode 100644 index 000000000..9a21c4d8f --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha3-asm_c.c @@ -0,0 +1,2357 @@ +/* armv8-32-sha3-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha3/sha3.rb arm32 \ + * ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha3-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef WOLFSSL_SHA3 +#ifndef WOLFSSL_ARMASM_NO_NEON +static const word64 L_sha3_arm2_neon_rt[] = { + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL, +}; + +#include + +void BlockSha3(word64* state_p) +{ + register word64* state asm ("r0") = (word64*)state_p; + register word64* L_sha3_arm2_neon_rt_c asm ("r1") = + (word64*)&L_sha3_arm2_neon_rt; + + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "mov r2, #24\n\t" + "mov r3, sp\n\t" + "vld1.8 {d0-d3}, [%[state]]!\n\t" + "vld1.8 {d4-d7}, [%[state]]!\n\t" + "vld1.8 {d8-d11}, [%[state]]!\n\t" + "vld1.8 {d12-d15}, [%[state]]!\n\t" + "vld1.8 {d16-d19}, [%[state]]!\n\t" + "vld1.8 {d20-d23}, [%[state]]!\n\t" + "vld1.8 {d24}, [%[state]]\n\t" + "sub %[state], %[state], #0xc0\n\t" + "\n" + "L_sha3_arm32_neon_begin_%=: \n\t" + /* Calc b[0..4] */ + "veor d26, d0, d5\n\t" + "veor d27, d1, d6\n\t" + "veor d28, d2, d7\n\t" + "veor d29, d3, d8\n\t" + "veor d25, d4, d9\n\t" + "veor d26, d26, d10\n\t" + "veor d27, d27, d11\n\t" + "veor d28, d28, d12\n\t" + "veor d29, d29, d13\n\t" + "veor d25, d25, d14\n\t" + "veor d26, d26, d15\n\t" + "veor d27, d27, d16\n\t" + "veor d28, d28, d17\n\t" + "veor d29, d29, d18\n\t" + "veor d25, d25, d19\n\t" + "veor d26, d26, d20\n\t" + "veor d27, d27, d21\n\t" + "veor d28, d28, d22\n\t" + "veor d29, d29, d23\n\t" + "veor d25, d25, d24\n\t" + "vst1.8 {d25-d26}, [r3]\n\t" + /* Calc t[0..4] and XOR into s[i*5..i*5+4] */ + /* t[0] */ + "vshr.u64 d30, d27, #63\n\t" + "vshl.u64 d31, d27, #1\n\t" + "veor d25, d25, d30\n\t" + "veor d25, d25, d31\n\t" + /* t[1] */ + "vshr.u64 d30, d28, #63\n\t" + "vshl.u64 d31, d28, #1\n\t" + "veor d26, d26, d30\n\t" + "veor d26, d26, d31\n\t" + /* t[2] */ + "vshr.u64 d30, d29, #63\n\t" + "vshl.u64 d31, d29, #1\n\t" + "veor d27, d27, d30\n\t" + "veor d27, d27, d31\n\t" + /* t[3] */ + "vldr.8 d31, [r3]\n\t" + "vshr.u64 d30, d31, #63\n\t" + "vshl.u64 d31, d31, #1\n\t" + "veor d28, d28, d30\n\t" + "veor d28, 
d28, d31\n\t" + /* t[4] */ + "vldr.8 d31, [r3, #8]\n\t" + "vshr.u64 d30, d31, #63\n\t" + "vshl.u64 d31, d31, #1\n\t" + "veor d29, d29, d30\n\t" + "veor d29, d29, d31\n\t" + "sub r3, r3, #16\n\t" + "veor d0, d0, d25\n\t" + /* s[1] => s[10] (tmp) */ + "veor d30, d1, d26\n\t" + "vshr.u64 d31, d30, #63\n\t" + "vshl.u64 d30, d30, #1\n\t" + "veor d30, d30, d31\n\t" + /* s[6] => s[1] */ + "veor d1, d6, d26\n\t" + "vshr.u64 d31, d1, #20\n\t" + "vshl.u64 d1, d1, #44\n\t" + "veor d1, d1, d31\n\t" + /* s[9] => s[6] */ + "veor d6, d9, d29\n\t" + "vshr.u64 d31, d6, #44\n\t" + "vshl.u64 d6, d6, #20\n\t" + "veor d6, d6, d31\n\t" + /* s[22] => s[9] */ + "veor d9, d22, d27\n\t" + "vshr.u64 d31, d9, #3\n\t" + "vshl.u64 d9, d9, #61\n\t" + "veor d9, d9, d31\n\t" + /* s[14] => s[22] */ + "veor d22, d14, d29\n\t" + "vshr.u64 d31, d22, #25\n\t" + "vshl.u64 d22, d22, #39\n\t" + "veor d22, d22, d31\n\t" + /* s[20] => s[14] */ + "veor d14, d20, d25\n\t" + "vshr.u64 d31, d14, #46\n\t" + "vshl.u64 d14, d14, #18\n\t" + "veor d14, d14, d31\n\t" + /* s[2] => s[20] */ + "veor d20, d2, d27\n\t" + "vshr.u64 d31, d20, #2\n\t" + "vshl.u64 d20, d20, #62\n\t" + "veor d20, d20, d31\n\t" + /* s[12] => s[2] */ + "veor d2, d12, d27\n\t" + "vshr.u64 d31, d2, #21\n\t" + "vshl.u64 d2, d2, #43\n\t" + "veor d2, d2, d31\n\t" + /* s[13] => s[12] */ + "veor d12, d13, d28\n\t" + "vshr.u64 d31, d12, #39\n\t" + "vshl.u64 d12, d12, #25\n\t" + "veor d12, d12, d31\n\t" + /* s[19] => s[13] */ + "veor d13, d19, d29\n\t" + "vshr.u64 d31, d13, #56\n\t" + "vshl.u64 d13, d13, #8\n\t" + "veor d13, d13, d31\n\t" + /* s[23] => s[19] */ + "veor d19, d23, d28\n\t" + "vshr.u64 d31, d19, #8\n\t" + "vshl.u64 d19, d19, #56\n\t" + "veor d19, d19, d31\n\t" + /* s[15] => s[23] */ + "veor d23, d15, d25\n\t" + "vshr.u64 d31, d23, #23\n\t" + "vshl.u64 d23, d23, #41\n\t" + "veor d23, d23, d31\n\t" + /* s[4] => s[15] */ + "veor d15, d4, d29\n\t" + "vshr.u64 d31, d15, #37\n\t" + "vshl.u64 d15, d15, #27\n\t" + "veor d15, d15, d31\n\t" + /* s[24] => s[4] */ + "veor d4, d24, d29\n\t" + "vshr.u64 d31, d4, #50\n\t" + "vshl.u64 d4, d4, #14\n\t" + "veor d4, d4, d31\n\t" + /* s[21] => s[24] */ + "veor d24, d21, d26\n\t" + "vshr.u64 d31, d24, #62\n\t" + "vshl.u64 d24, d24, #2\n\t" + "veor d24, d24, d31\n\t" + /* s[8] => s[21] */ + "veor d21, d8, d28\n\t" + "vshr.u64 d31, d21, #9\n\t" + "vshl.u64 d21, d21, #55\n\t" + "veor d21, d21, d31\n\t" + /* s[16] => s[8] */ + "veor d8, d16, d26\n\t" + "vshr.u64 d31, d8, #19\n\t" + "vshl.u64 d8, d8, #45\n\t" + "veor d8, d8, d31\n\t" + /* s[5] => s[16] */ + "veor d16, d5, d25\n\t" + "vshr.u64 d31, d16, #28\n\t" + "vshl.u64 d16, d16, #36\n\t" + "veor d16, d16, d31\n\t" + /* s[3] => s[5] */ + "veor d5, d3, d28\n\t" + "vshr.u64 d31, d5, #36\n\t" + "vshl.u64 d5, d5, #28\n\t" + "veor d5, d5, d31\n\t" + /* s[18] => s[3] */ + "veor d3, d18, d28\n\t" + "vshr.u64 d31, d3, #43\n\t" + "vshl.u64 d3, d3, #21\n\t" + "veor d3, d3, d31\n\t" + /* s[17] => s[18] */ + "veor d18, d17, d27\n\t" + "vshr.u64 d31, d18, #49\n\t" + "vshl.u64 d18, d18, #15\n\t" + "veor d18, d18, d31\n\t" + /* s[11] => s[17] */ + "veor d17, d11, d26\n\t" + "vshr.u64 d31, d17, #54\n\t" + "vshl.u64 d17, d17, #10\n\t" + "veor d17, d17, d31\n\t" + /* s[7] => s[11] */ + "veor d11, d7, d27\n\t" + "vshr.u64 d31, d11, #58\n\t" + "vshl.u64 d11, d11, #6\n\t" + "veor d11, d11, d31\n\t" + /* s[10] => s[7] */ + "veor d7, d10, d25\n\t" + "vshr.u64 d31, d7, #61\n\t" + "vshl.u64 d7, d7, #3\n\t" + "veor d7, d7, d31\n\t" + /* Row Mix */ + "vmov d25, d0\n\t" + "vmov d26, d1\n\t" + "vbic d31, d2, 
d26\n\t" + "veor d0, d25, d31\n\t" + "vbic d31, d3, d2\n\t" + "veor d1, d26, d31\n\t" + "vbic d31, d4, d3\n\t" + "veor d2, d2, d31\n\t" + "vbic d31, d25, d4\n\t" + "veor d3, d3, d31\n\t" + "vbic d31, d26, d25\n\t" + "veor d4, d4, d31\n\t" + "vmov d25, d5\n\t" + "vmov d26, d6\n\t" + "vbic d31, d7, d26\n\t" + "veor d5, d25, d31\n\t" + "vbic d31, d8, d7\n\t" + "veor d6, d26, d31\n\t" + "vbic d31, d9, d8\n\t" + "veor d7, d7, d31\n\t" + "vbic d31, d25, d9\n\t" + "veor d8, d8, d31\n\t" + "vbic d31, d26, d25\n\t" + "veor d9, d9, d31\n\t" + "vmov d26, d11\n\t" + "vbic d31, d12, d26\n\t" + "veor d10, d30, d31\n\t" + "vbic d31, d13, d12\n\t" + "veor d11, d26, d31\n\t" + "vbic d31, d14, d13\n\t" + "veor d12, d12, d31\n\t" + "vbic d31, d30, d14\n\t" + "veor d13, d13, d31\n\t" + "vbic d31, d26, d30\n\t" + "veor d14, d14, d31\n\t" + "vmov d25, d15\n\t" + "vmov d26, d16\n\t" + "vbic d31, d17, d26\n\t" + "veor d15, d25, d31\n\t" + "vbic d31, d18, d17\n\t" + "veor d16, d26, d31\n\t" + "vbic d31, d19, d18\n\t" + "veor d17, d17, d31\n\t" + "vbic d31, d25, d19\n\t" + "veor d18, d18, d31\n\t" + "vbic d31, d26, d25\n\t" + "veor d19, d19, d31\n\t" + "vmov d25, d20\n\t" + "vmov d26, d21\n\t" + "vbic d31, d22, d26\n\t" + "veor d20, d25, d31\n\t" + "vbic d31, d23, d22\n\t" + "veor d21, d26, d31\n\t" + "vbic d31, d24, d23\n\t" + "veor d22, d22, d31\n\t" + "vbic d31, d25, d24\n\t" + "veor d23, d23, d31\n\t" + "vbic d31, d26, d25\n\t" + "veor d24, d24, d31\n\t" + "vld1.8 {d30}, [r1]!\n\t" + "subs r2, r2, #1\n\t" + "veor d0, d0, d30\n\t" + "bne L_sha3_arm32_neon_begin_%=\n\t" + "vst1.8 {d0-d3}, [%[state]]!\n\t" + "vst1.8 {d4-d7}, [%[state]]!\n\t" + "vst1.8 {d8-d11}, [%[state]]!\n\t" + "vst1.8 {d12-d15}, [%[state]]!\n\t" + "vst1.8 {d16-d19}, [%[state]]!\n\t" + "vst1.8 {d20-d23}, [%[state]]!\n\t" + "vst1.8 {d24}, [%[state]]\n\t" + "add sp, sp, #16\n\t" + : [state] "+r" (state), + [L_sha3_arm2_neon_rt] "+r" (L_sha3_arm2_neon_rt_c) + : + : "memory", "cc", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", + "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", + "d26", "d27", "d28", "d29", "d30", "d31" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifdef WOLFSSL_ARMASM_NO_NEON +static const word64 L_sha3_arm2_rt[] = { + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL, +}; + +#include + +void BlockSha3(word64* state_p) +{ + register word64* state asm ("r0") = (word64*)state_p; + register word64* L_sha3_arm2_rt_c asm ("r1") = (word64*)&L_sha3_arm2_rt; + + __asm__ __volatile__ ( + "sub sp, sp, #0xcc\n\t" + "mov r2, #12\n\t" + "\n" + "L_sha3_arm32_begin_%=: \n\t" + "str r2, [sp, #200]\n\t" + /* Round even */ + /* Calc b[4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #32]\n\t" + "ldr r5, [%[state], #36]\n\t" +#else + "ldrd r4, r5, [%[state], #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #72]\n\t" + "ldr r7, [%[state], #76]\n\t" +#else + "ldrd r6, r7, [%[state], 
#72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #112]\n\t" + "ldr r9, [%[state], #116]\n\t" +#else + "ldrd r8, r9, [%[state], #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #152]\n\t" + "ldr r11, [%[state], #156]\n\t" +#else + "ldrd r10, r11, [%[state], #152]\n\t" +#endif + "ldr r12, [%[state], #192]\n\t" + "ldr lr, [%[state], #196]\n\t" + "eor r2, r4, r6\n\t" + "eor r3, r5, r7\n\t" + "eor r2, r2, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r2, r2, r10\n\t" + "eor r3, r3, r11\n\t" + "eor r2, r2, r12\n\t" + "eor r3, r3, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r2, [sp, #32]\n\t" + "str r3, [sp, #36]\n\t" +#else + "strd r2, r3, [sp, #32]\n\t" +#endif + /* Calc b[1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #8]\n\t" + "ldr r5, [%[state], #12]\n\t" +#else + "ldrd r4, r5, [%[state], #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #48]\n\t" + "ldr r7, [%[state], #52]\n\t" +#else + "ldrd r6, r7, [%[state], #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #88]\n\t" + "ldr r9, [%[state], #92]\n\t" +#else + "ldrd r8, r9, [%[state], #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #128]\n\t" + "ldr r11, [%[state], #132]\n\t" +#else + "ldrd r10, r11, [%[state], #128]\n\t" +#endif + "ldr r12, [%[state], #168]\n\t" + "ldr lr, [%[state], #172]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r4, r4, r10\n\t" + "eor r5, r5, r11\n\t" + "eor r4, r4, r12\n\t" + "eor r5, r5, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #8]\n\t" + "str r5, [sp, #12]\n\t" +#else + "strd r4, r5, [sp, #8]\n\t" +#endif + /* Calc t[0] */ + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* Calc b[0] and XOR t[0] into s[x*5+0] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state]]\n\t" + "ldr r5, [%[state], #4]\n\t" +#else + "ldrd r4, r5, [%[state]]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #40]\n\t" + "ldr r7, [%[state], #44]\n\t" +#else + "ldrd r6, r7, [%[state], #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #80]\n\t" + "ldr r9, [%[state], #84]\n\t" +#else + "ldrd r8, r9, [%[state], #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #120]\n\t" + "ldr r11, [%[state], #124]\n\t" +#else + "ldrd r10, r11, [%[state], #120]\n\t" +#endif + "eor r12, r4, r6\n\t" + "eor lr, r5, r7\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state]]\n\t" + "str r5, [%[state], #4]\n\t" +#else + "strd r4, r5, [%[state]]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [%[state], #40]\n\t" + "str r7, [%[state], #44]\n\t" +#else + "strd r6, r7, [%[state], #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [%[state], #80]\n\t" + "str r9, 
[%[state], #84]\n\t" +#else + "strd r8, r9, [%[state], #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #120]\n\t" + "str r11, [%[state], #124]\n\t" +#else + "strd r10, r11, [%[state], #120]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #160]\n\t" + "ldr r11, [%[state], #164]\n\t" +#else + "ldrd r10, r11, [%[state], #160]\n\t" +#endif + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #160]\n\t" + "str r11, [%[state], #164]\n\t" +#else + "strd r10, r11, [%[state], #160]\n\t" +#endif + "str r12, [sp]\n\t" + "str lr, [sp, #4]\n\t" + /* Calc b[3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #24]\n\t" + "ldr r5, [%[state], #28]\n\t" +#else + "ldrd r4, r5, [%[state], #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #64]\n\t" + "ldr r7, [%[state], #68]\n\t" +#else + "ldrd r6, r7, [%[state], #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #104]\n\t" + "ldr r9, [%[state], #108]\n\t" +#else + "ldrd r8, r9, [%[state], #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #144]\n\t" + "ldr r11, [%[state], #148]\n\t" +#else + "ldrd r10, r11, [%[state], #144]\n\t" +#endif + "ldr r12, [%[state], #184]\n\t" + "ldr lr, [%[state], #188]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r4, r4, r10\n\t" + "eor r5, r5, r11\n\t" + "eor r4, r4, r12\n\t" + "eor r5, r5, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #24]\n\t" + "str r5, [sp, #28]\n\t" +#else + "strd r4, r5, [sp, #24]\n\t" +#endif + /* Calc t[2] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #8]\n\t" + "ldr r3, [sp, #12]\n\t" +#else + "ldrd r2, r3, [sp, #8]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* Calc b[2] and XOR t[2] into s[x*5+2] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #16]\n\t" + "ldr r5, [%[state], #20]\n\t" +#else + "ldrd r4, r5, [%[state], #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #56]\n\t" + "ldr r7, [%[state], #60]\n\t" +#else + "ldrd r6, r7, [%[state], #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #96]\n\t" + "ldr r9, [%[state], #100]\n\t" +#else + "ldrd r8, r9, [%[state], #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #136]\n\t" + "ldr r11, [%[state], #140]\n\t" +#else + "ldrd r10, r11, [%[state], #136]\n\t" +#endif + "eor r12, r4, r6\n\t" + "eor lr, r5, r7\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #16]\n\t" + "str r5, [%[state], #20]\n\t" +#else + "strd r4, r5, [%[state], #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [%[state], #56]\n\t" + "str r7, [%[state], 
#60]\n\t" +#else + "strd r6, r7, [%[state], #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [%[state], #96]\n\t" + "str r9, [%[state], #100]\n\t" +#else + "strd r8, r9, [%[state], #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #136]\n\t" + "str r11, [%[state], #140]\n\t" +#else + "strd r10, r11, [%[state], #136]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #176]\n\t" + "ldr r11, [%[state], #180]\n\t" +#else + "ldrd r10, r11, [%[state], #176]\n\t" +#endif + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #176]\n\t" + "str r11, [%[state], #180]\n\t" +#else + "strd r10, r11, [%[state], #176]\n\t" +#endif + "str r12, [sp, #16]\n\t" + "str lr, [sp, #20]\n\t" + /* Calc t[1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp]\n\t" + "ldr r3, [sp, #4]\n\t" +#else + "ldrd r2, r3, [sp]\n\t" +#endif + "eor r2, r2, lr, lsr #31\n\t" + "eor r3, r3, r12, lsr #31\n\t" + "eor r2, r2, r12, lsl #1\n\t" + "eor r3, r3, lr, lsl #1\n\t" + /* XOR t[1] into s[x*5+1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #8]\n\t" + "ldr r5, [%[state], #12]\n\t" +#else + "ldrd r4, r5, [%[state], #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #48]\n\t" + "ldr r7, [%[state], #52]\n\t" +#else + "ldrd r6, r7, [%[state], #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #88]\n\t" + "ldr r9, [%[state], #92]\n\t" +#else + "ldrd r8, r9, [%[state], #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #128]\n\t" + "ldr r11, [%[state], #132]\n\t" +#else + "ldrd r10, r11, [%[state], #128]\n\t" +#endif + "ldr r12, [%[state], #168]\n\t" + "ldr lr, [%[state], #172]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #8]\n\t" + "str r5, [%[state], #12]\n\t" +#else + "strd r4, r5, [%[state], #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [%[state], #48]\n\t" + "str r7, [%[state], #52]\n\t" +#else + "strd r6, r7, [%[state], #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [%[state], #88]\n\t" + "str r9, [%[state], #92]\n\t" +#else + "strd r8, r9, [%[state], #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #128]\n\t" + "str r11, [%[state], #132]\n\t" +#else + "strd r10, r11, [%[state], #128]\n\t" +#endif + "str r12, [%[state], #168]\n\t" + "str lr, [%[state], #172]\n\t" + /* Calc t[3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #16]\n\t" + "ldr r3, [sp, #20]\n\t" +#else + "ldrd r2, r3, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #36]\n\t" +#else + "ldrd r4, r5, [sp, #32]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* XOR t[3] into s[x*5+3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
7) + "ldr r4, [%[state], #24]\n\t" + "ldr r5, [%[state], #28]\n\t" +#else + "ldrd r4, r5, [%[state], #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #64]\n\t" + "ldr r7, [%[state], #68]\n\t" +#else + "ldrd r6, r7, [%[state], #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #104]\n\t" + "ldr r9, [%[state], #108]\n\t" +#else + "ldrd r8, r9, [%[state], #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #144]\n\t" + "ldr r11, [%[state], #148]\n\t" +#else + "ldrd r10, r11, [%[state], #144]\n\t" +#endif + "ldr r12, [%[state], #184]\n\t" + "ldr lr, [%[state], #188]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #24]\n\t" + "str r5, [%[state], #28]\n\t" +#else + "strd r4, r5, [%[state], #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [%[state], #64]\n\t" + "str r7, [%[state], #68]\n\t" +#else + "strd r6, r7, [%[state], #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [%[state], #104]\n\t" + "str r9, [%[state], #108]\n\t" +#else + "strd r8, r9, [%[state], #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #144]\n\t" + "str r11, [%[state], #148]\n\t" +#else + "strd r10, r11, [%[state], #144]\n\t" +#endif + "str r12, [%[state], #184]\n\t" + "str lr, [%[state], #188]\n\t" + /* Calc t[4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #24]\n\t" + "ldr r3, [sp, #28]\n\t" +#else + "ldrd r2, r3, [sp, #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #4]\n\t" +#else + "ldrd r4, r5, [sp]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* XOR t[4] into s[x*5+4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #32]\n\t" + "ldr r5, [%[state], #36]\n\t" +#else + "ldrd r4, r5, [%[state], #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #72]\n\t" + "ldr r7, [%[state], #76]\n\t" +#else + "ldrd r6, r7, [%[state], #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #112]\n\t" + "ldr r9, [%[state], #116]\n\t" +#else + "ldrd r8, r9, [%[state], #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #152]\n\t" + "ldr r11, [%[state], #156]\n\t" +#else + "ldrd r10, r11, [%[state], #152]\n\t" +#endif + "ldr r12, [%[state], #192]\n\t" + "ldr lr, [%[state], #196]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #32]\n\t" + "str r5, [%[state], #36]\n\t" +#else + "strd r4, r5, [%[state], #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [%[state], #72]\n\t" + "str r7, [%[state], #76]\n\t" +#else + "strd r6, r7, [%[state], #72]\n\t" +#endif +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [%[state], #112]\n\t" + "str r9, [%[state], #116]\n\t" +#else + "strd r8, r9, [%[state], #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [%[state], #152]\n\t" + "str r11, [%[state], #156]\n\t" +#else + "strd r10, r11, [%[state], #152]\n\t" +#endif + "str r12, [%[state], #192]\n\t" + "str lr, [%[state], #196]\n\t" + /* Row Mix */ + /* Row 0 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state]]\n\t" + "ldr r3, [%[state], #4]\n\t" +#else + "ldrd r2, r3, [%[state]]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #48]\n\t" + "ldr r5, [%[state], #52]\n\t" +#else + "ldrd r4, r5, [%[state], #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #96]\n\t" + "ldr r7, [%[state], #100]\n\t" +#else + "ldrd r6, r7, [%[state], #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #144]\n\t" + "ldr r9, [%[state], #148]\n\t" +#else + "ldrd r8, r9, [%[state], #144]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #192]\n\t" + "ldr r11, [%[state], #196]\n\t" +#else + "ldrd r10, r11, [%[state], #192]\n\t" +#endif + /* s[1] <<< 44 */ + "mov lr, r4\n\t" + "lsr r12, r5, #20\n\t" + "lsr r4, r4, #20\n\t" + "orr r4, r4, r5, lsl #12\n\t" + "orr r5, r12, lr, lsl #12\n\t" + /* s[2] <<< 43 */ + "mov lr, r6\n\t" + "lsr r12, r7, #21\n\t" + "lsr r6, r6, #21\n\t" + "orr r6, r6, r7, lsl #11\n\t" + "orr r7, r12, lr, lsl #11\n\t" + /* s[3] <<< 21 */ + "lsr r12, r9, #11\n\t" + "lsr lr, r8, #11\n\t" + "orr r8, r12, r8, lsl #21\n\t" + "orr r9, lr, r9, lsl #21\n\t" + /* s[4] <<< 14 */ + "lsr r12, r11, #18\n\t" + "lsr lr, r10, #18\n\t" + "orr r10, r12, r10, lsl #14\n\t" + "orr r11, lr, r11, lsl #14\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [sp, #8]\n\t" + "str lr, [sp, #12]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [sp, #16]\n\t" + "str lr, [sp, #20]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [sp, #24]\n\t" + "str lr, [sp, #28]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp, #32]\n\t" + "str lr, [sp, #36]\n\t" + /* Get constant */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [r1]\n\t" + "ldr r11, [r1, #4]\n\t" +#else + "ldrd r10, r11, [r1]\n\t" +#endif + "add r1, r1, #8\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + /* XOR in constant */ + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp]\n\t" + "str lr, [sp, #4]\n\t" + /* Row 1 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #24]\n\t" + "ldr r3, [%[state], #28]\n\t" +#else + "ldrd r2, r3, [%[state], #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #72]\n\t" + "ldr r5, [%[state], #76]\n\t" +#else + "ldrd r4, r5, [%[state], #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #80]\n\t" + "ldr r7, [%[state], #84]\n\t" +#else + "ldrd r6, r7, [%[state], #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #128]\n\t" + "ldr r9, [%[state], #132]\n\t" 
+#else + "ldrd r8, r9, [%[state], #128]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #176]\n\t" + "ldr r11, [%[state], #180]\n\t" +#else + "ldrd r10, r11, [%[state], #176]\n\t" +#endif + /* s[0] <<< 28 */ + "lsr r12, r3, #4\n\t" + "lsr lr, r2, #4\n\t" + "orr r2, r12, r2, lsl #28\n\t" + "orr r3, lr, r3, lsl #28\n\t" + /* s[1] <<< 20 */ + "lsr r12, r5, #12\n\t" + "lsr lr, r4, #12\n\t" + "orr r4, r12, r4, lsl #20\n\t" + "orr r5, lr, r5, lsl #20\n\t" + /* s[2] <<< 3 */ + "lsr r12, r7, #29\n\t" + "lsr lr, r6, #29\n\t" + "orr r6, r12, r6, lsl #3\n\t" + "orr r7, lr, r7, lsl #3\n\t" + /* s[3] <<< 45 */ + "mov lr, r8\n\t" + "lsr r12, r9, #19\n\t" + "lsr r8, r8, #19\n\t" + "orr r8, r8, r9, lsl #13\n\t" + "orr r9, r12, lr, lsl #13\n\t" + /* s[4] <<< 61 */ + "mov lr, r10\n\t" + "lsr r12, r11, #3\n\t" + "lsr r10, r10, #3\n\t" + "orr r10, r10, r11, lsl #29\n\t" + "orr r11, r12, lr, lsl #29\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [sp, #48]\n\t" + "str lr, [sp, #52]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [sp, #56]\n\t" + "str lr, [sp, #60]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [sp, #64]\n\t" + "str lr, [sp, #68]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp, #72]\n\t" + "str lr, [sp, #76]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [sp, #40]\n\t" + "str lr, [sp, #44]\n\t" + /* Row 2 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #8]\n\t" + "ldr r3, [%[state], #12]\n\t" +#else + "ldrd r2, r3, [%[state], #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #56]\n\t" + "ldr r5, [%[state], #60]\n\t" +#else + "ldrd r4, r5, [%[state], #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #104]\n\t" + "ldr r7, [%[state], #108]\n\t" +#else + "ldrd r6, r7, [%[state], #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #152]\n\t" + "ldr r9, [%[state], #156]\n\t" +#else + "ldrd r8, r9, [%[state], #152]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #160]\n\t" + "ldr r11, [%[state], #164]\n\t" +#else + "ldrd r10, r11, [%[state], #160]\n\t" +#endif + /* s[0] <<< 1 */ + "lsr r12, r3, #31\n\t" + "lsr lr, r2, #31\n\t" + "orr r2, r12, r2, lsl #1\n\t" + "orr r3, lr, r3, lsl #1\n\t" + /* s[1] <<< 6 */ + "lsr r12, r5, #26\n\t" + "lsr lr, r4, #26\n\t" + "orr r4, r12, r4, lsl #6\n\t" + "orr r5, lr, r5, lsl #6\n\t" + /* s[2] <<< 25 */ + "lsr r12, r7, #7\n\t" + "lsr lr, r6, #7\n\t" + "orr r6, r12, r6, lsl #25\n\t" + "orr r7, lr, r7, lsl #25\n\t" + /* s[3] <<< 8 */ + "lsr r12, r9, #24\n\t" + "lsr lr, r8, #24\n\t" + "orr r8, r12, r8, lsl #8\n\t" + "orr r9, lr, r9, lsl #8\n\t" + /* s[4] <<< 18 */ + "lsr r12, r11, #14\n\t" + "lsr lr, r10, #14\n\t" + "orr r10, r12, r10, lsl #18\n\t" + "orr r11, lr, r11, lsl #18\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [sp, #88]\n\t" + "str lr, [sp, #92]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [sp, #96]\n\t" + "str lr, [sp, #100]\n\t" + "bic r12, 
r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [sp, #104]\n\t" + "str lr, [sp, #108]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp, #112]\n\t" + "str lr, [sp, #116]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [sp, #80]\n\t" + "str lr, [sp, #84]\n\t" + /* Row 3 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #32]\n\t" + "ldr r3, [%[state], #36]\n\t" +#else + "ldrd r2, r3, [%[state], #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #40]\n\t" + "ldr r5, [%[state], #44]\n\t" +#else + "ldrd r4, r5, [%[state], #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #88]\n\t" + "ldr r7, [%[state], #92]\n\t" +#else + "ldrd r6, r7, [%[state], #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #136]\n\t" + "ldr r9, [%[state], #140]\n\t" +#else + "ldrd r8, r9, [%[state], #136]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #184]\n\t" + "ldr r11, [%[state], #188]\n\t" +#else + "ldrd r10, r11, [%[state], #184]\n\t" +#endif + /* s[0] <<< 27 */ + "lsr r12, r3, #5\n\t" + "lsr lr, r2, #5\n\t" + "orr r2, r12, r2, lsl #27\n\t" + "orr r3, lr, r3, lsl #27\n\t" + /* s[1] <<< 36 */ + "mov lr, r4\n\t" + "lsr r12, r5, #28\n\t" + "lsr r4, r4, #28\n\t" + "orr r4, r4, r5, lsl #4\n\t" + "orr r5, r12, lr, lsl #4\n\t" + /* s[2] <<< 10 */ + "lsr r12, r7, #22\n\t" + "lsr lr, r6, #22\n\t" + "orr r6, r12, r6, lsl #10\n\t" + "orr r7, lr, r7, lsl #10\n\t" + /* s[3] <<< 15 */ + "lsr r12, r9, #17\n\t" + "lsr lr, r8, #17\n\t" + "orr r8, r12, r8, lsl #15\n\t" + "orr r9, lr, r9, lsl #15\n\t" + /* s[4] <<< 56 */ + "mov lr, r10\n\t" + "lsr r12, r11, #8\n\t" + "lsr r10, r10, #8\n\t" + "orr r10, r10, r11, lsl #24\n\t" + "orr r11, r12, lr, lsl #24\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [sp, #128]\n\t" + "str lr, [sp, #132]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [sp, #136]\n\t" + "str lr, [sp, #140]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [sp, #144]\n\t" + "str lr, [sp, #148]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp, #152]\n\t" + "str lr, [sp, #156]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [sp, #120]\n\t" + "str lr, [sp, #124]\n\t" + /* Row 4 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #16]\n\t" + "ldr r3, [%[state], #20]\n\t" +#else + "ldrd r2, r3, [%[state], #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #64]\n\t" + "ldr r5, [%[state], #68]\n\t" +#else + "ldrd r4, r5, [%[state], #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [%[state], #112]\n\t" + "ldr r7, [%[state], #116]\n\t" +#else + "ldrd r6, r7, [%[state], #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [%[state], #120]\n\t" + "ldr r9, [%[state], #124]\n\t" +#else + "ldrd r8, r9, [%[state], #120]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) + "ldr r10, [%[state], #168]\n\t" + "ldr r11, [%[state], #172]\n\t" +#else + "ldrd r10, r11, [%[state], #168]\n\t" +#endif + /* s[0] <<< 62 */ + "mov lr, r2\n\t" + "lsr r12, r3, #2\n\t" + "lsr r2, r2, #2\n\t" + "orr r2, r2, r3, lsl #30\n\t" + "orr r3, r12, lr, lsl #30\n\t" + /* s[1] <<< 55 */ + "mov lr, r4\n\t" + "lsr r12, r5, #9\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "orr r5, r12, lr, lsl #23\n\t" + /* s[2] <<< 39 */ + "mov lr, r6\n\t" + "lsr r12, r7, #25\n\t" + "lsr r6, r6, #25\n\t" + "orr r6, r6, r7, lsl #7\n\t" + "orr r7, r12, lr, lsl #7\n\t" + /* s[3] <<< 41 */ + "mov lr, r8\n\t" + "lsr r12, r9, #23\n\t" + "lsr r8, r8, #23\n\t" + "orr r8, r8, r9, lsl #9\n\t" + "orr r9, r12, lr, lsl #9\n\t" + /* s[4] <<< 2 */ + "lsr r12, r11, #30\n\t" + "lsr lr, r10, #30\n\t" + "orr r10, r12, r10, lsl #2\n\t" + "orr r11, lr, r11, lsl #2\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [sp, #168]\n\t" + "str lr, [sp, #172]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [sp, #176]\n\t" + "str lr, [sp, #180]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [sp, #184]\n\t" + "str lr, [sp, #188]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [sp, #192]\n\t" + "str lr, [sp, #196]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [sp, #160]\n\t" + "str lr, [sp, #164]\n\t" + /* Round odd */ + /* Calc b[4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #36]\n\t" +#else + "ldrd r4, r5, [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #72]\n\t" + "ldr r7, [sp, #76]\n\t" +#else + "ldrd r6, r7, [sp, #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #112]\n\t" + "ldr r9, [sp, #116]\n\t" +#else + "ldrd r8, r9, [sp, #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #152]\n\t" + "ldr r11, [sp, #156]\n\t" +#else + "ldrd r10, r11, [sp, #152]\n\t" +#endif + "ldr r12, [sp, #192]\n\t" + "ldr lr, [sp, #196]\n\t" + "eor r2, r4, r6\n\t" + "eor r3, r5, r7\n\t" + "eor r2, r2, r8\n\t" + "eor r3, r3, r9\n\t" + "eor r2, r2, r10\n\t" + "eor r3, r3, r11\n\t" + "eor r2, r2, r12\n\t" + "eor r3, r3, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r2, [%[state], #32]\n\t" + "str r3, [%[state], #36]\n\t" +#else + "strd r2, r3, [%[state], #32]\n\t" +#endif + /* Calc b[1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #8]\n\t" + "ldr r5, [sp, #12]\n\t" +#else + "ldrd r4, r5, [sp, #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #48]\n\t" + "ldr r7, [sp, #52]\n\t" +#else + "ldrd r6, r7, [sp, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #88]\n\t" + "ldr r9, [sp, #92]\n\t" +#else + "ldrd r8, r9, [sp, #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #128]\n\t" + "ldr r11, [sp, #132]\n\t" +#else + "ldrd r10, r11, [sp, #128]\n\t" +#endif + "ldr r12, [sp, #168]\n\t" + "ldr lr, [sp, #172]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r4, r4, r10\n\t" + "eor r5, r5, 
r11\n\t" + "eor r4, r4, r12\n\t" + "eor r5, r5, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #8]\n\t" + "str r5, [%[state], #12]\n\t" +#else + "strd r4, r5, [%[state], #8]\n\t" +#endif + /* Calc t[0] */ + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* Calc b[0] and XOR t[0] into s[x*5+0] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #4]\n\t" +#else + "ldrd r4, r5, [sp]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #40]\n\t" + "ldr r7, [sp, #44]\n\t" +#else + "ldrd r6, r7, [sp, #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #80]\n\t" + "ldr r9, [sp, #84]\n\t" +#else + "ldrd r8, r9, [sp, #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #120]\n\t" + "ldr r11, [sp, #124]\n\t" +#else + "ldrd r10, r11, [sp, #120]\n\t" +#endif + "eor r12, r4, r6\n\t" + "eor lr, r5, r7\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp]\n\t" + "str r5, [sp, #4]\n\t" +#else + "strd r4, r5, [sp]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #40]\n\t" + "str r7, [sp, #44]\n\t" +#else + "strd r6, r7, [sp, #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #80]\n\t" + "str r9, [sp, #84]\n\t" +#else + "strd r8, r9, [sp, #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #120]\n\t" + "str r11, [sp, #124]\n\t" +#else + "strd r10, r11, [sp, #120]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #160]\n\t" + "ldr r11, [sp, #164]\n\t" +#else + "ldrd r10, r11, [sp, #160]\n\t" +#endif + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #160]\n\t" + "str r11, [sp, #164]\n\t" +#else + "strd r10, r11, [sp, #160]\n\t" +#endif + "str r12, [%[state]]\n\t" + "str lr, [%[state], #4]\n\t" + /* Calc b[3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #24]\n\t" + "ldr r5, [sp, #28]\n\t" +#else + "ldrd r4, r5, [sp, #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #64]\n\t" + "ldr r7, [sp, #68]\n\t" +#else + "ldrd r6, r7, [sp, #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #104]\n\t" + "ldr r9, [sp, #108]\n\t" +#else + "ldrd r8, r9, [sp, #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #144]\n\t" + "ldr r11, [sp, #148]\n\t" +#else + "ldrd r10, r11, [sp, #144]\n\t" +#endif + "ldr r12, [sp, #184]\n\t" + "ldr lr, [sp, #188]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r4, r4, r10\n\t" + "eor r5, r5, r11\n\t" + "eor r4, r4, r12\n\t" + "eor r5, r5, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [%[state], #24]\n\t" + "str r5, [%[state], #28]\n\t" +#else + "strd r4, r5, [%[state], #24]\n\t" +#endif + /* Calc t[2] */ +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #8]\n\t" + "ldr r3, [%[state], #12]\n\t" +#else + "ldrd r2, r3, [%[state], #8]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* Calc b[2] and XOR t[2] into s[x*5+2] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #16]\n\t" + "ldr r5, [sp, #20]\n\t" +#else + "ldrd r4, r5, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #56]\n\t" + "ldr r7, [sp, #60]\n\t" +#else + "ldrd r6, r7, [sp, #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #96]\n\t" + "ldr r9, [sp, #100]\n\t" +#else + "ldrd r8, r9, [sp, #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #136]\n\t" + "ldr r11, [sp, #140]\n\t" +#else + "ldrd r10, r11, [sp, #136]\n\t" +#endif + "eor r12, r4, r6\n\t" + "eor lr, r5, r7\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #16]\n\t" + "str r5, [sp, #20]\n\t" +#else + "strd r4, r5, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #56]\n\t" + "str r7, [sp, #60]\n\t" +#else + "strd r6, r7, [sp, #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #96]\n\t" + "str r9, [sp, #100]\n\t" +#else + "strd r8, r9, [sp, #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #136]\n\t" + "str r11, [sp, #140]\n\t" +#else + "strd r10, r11, [sp, #136]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #176]\n\t" + "ldr r11, [sp, #180]\n\t" +#else + "ldrd r10, r11, [sp, #176]\n\t" +#endif + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #176]\n\t" + "str r11, [sp, #180]\n\t" +#else + "strd r10, r11, [sp, #176]\n\t" +#endif + "str r12, [%[state], #16]\n\t" + "str lr, [%[state], #20]\n\t" + /* Calc t[1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state]]\n\t" + "ldr r3, [%[state], #4]\n\t" +#else + "ldrd r2, r3, [%[state]]\n\t" +#endif + "eor r2, r2, lr, lsr #31\n\t" + "eor r3, r3, r12, lsr #31\n\t" + "eor r2, r2, r12, lsl #1\n\t" + "eor r3, r3, lr, lsl #1\n\t" + /* XOR t[1] into s[x*5+1] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #8]\n\t" + "ldr r5, [sp, #12]\n\t" +#else + "ldrd r4, r5, [sp, #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #48]\n\t" + "ldr r7, [sp, #52]\n\t" +#else + "ldrd r6, r7, [sp, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #88]\n\t" + "ldr r9, [sp, #92]\n\t" +#else + "ldrd r8, r9, [sp, #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #128]\n\t" + "ldr r11, [sp, #132]\n\t" +#else + "ldrd r10, r11, [sp, #128]\n\t" +#endif + "ldr r12, [sp, #168]\n\t" + "ldr lr, [sp, #172]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + 
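+        /* Theta, odd round: r2:r3 hold t[1] = b[0] ^ rotl64(b[2], 1) and are
+         * XORed into all five lanes of column 1 (s[1], s[6], s[11], s[16],
+         * s[21]).  Each 64-bit lane is kept as a low/high pair of 32-bit
+         * registers.  Rounds run in even/odd pairs: the even round reads the
+         * lanes from state[] and writes its output rows to the stack copy,
+         * the odd round goes the other way, so the 12 loop iterations cover
+         * all 24 rounds. */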
"eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #8]\n\t" + "str r5, [sp, #12]\n\t" +#else + "strd r4, r5, [sp, #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #48]\n\t" + "str r7, [sp, #52]\n\t" +#else + "strd r6, r7, [sp, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #88]\n\t" + "str r9, [sp, #92]\n\t" +#else + "strd r8, r9, [sp, #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #128]\n\t" + "str r11, [sp, #132]\n\t" +#else + "strd r10, r11, [sp, #128]\n\t" +#endif + "str r12, [sp, #168]\n\t" + "str lr, [sp, #172]\n\t" + /* Calc t[3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #16]\n\t" + "ldr r3, [%[state], #20]\n\t" +#else + "ldrd r2, r3, [%[state], #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state], #32]\n\t" + "ldr r5, [%[state], #36]\n\t" +#else + "ldrd r4, r5, [%[state], #32]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* XOR t[3] into s[x*5+3] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #24]\n\t" + "ldr r5, [sp, #28]\n\t" +#else + "ldrd r4, r5, [sp, #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #64]\n\t" + "ldr r7, [sp, #68]\n\t" +#else + "ldrd r6, r7, [sp, #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #104]\n\t" + "ldr r9, [sp, #108]\n\t" +#else + "ldrd r8, r9, [sp, #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #144]\n\t" + "ldr r11, [sp, #148]\n\t" +#else + "ldrd r10, r11, [sp, #144]\n\t" +#endif + "ldr r12, [sp, #184]\n\t" + "ldr lr, [sp, #188]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #24]\n\t" + "str r5, [sp, #28]\n\t" +#else + "strd r4, r5, [sp, #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #64]\n\t" + "str r7, [sp, #68]\n\t" +#else + "strd r6, r7, [sp, #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #104]\n\t" + "str r9, [sp, #108]\n\t" +#else + "strd r8, r9, [sp, #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #144]\n\t" + "str r11, [sp, #148]\n\t" +#else + "strd r10, r11, [sp, #144]\n\t" +#endif + "str r12, [sp, #184]\n\t" + "str lr, [sp, #188]\n\t" + /* Calc t[4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [%[state], #24]\n\t" + "ldr r3, [%[state], #28]\n\t" +#else + "ldrd r2, r3, [%[state], #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[state]]\n\t" + "ldr r5, [%[state], #4]\n\t" +#else + "ldrd r4, r5, [%[state]]\n\t" +#endif + "eor r2, r2, r5, lsr #31\n\t" + "eor r3, r3, r4, lsr #31\n\t" + "eor r2, r2, r4, lsl #1\n\t" + "eor r3, r3, r5, lsl #1\n\t" + /* XOR t[4] into s[x*5+4] */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #36]\n\t" +#else + 
"ldrd r4, r5, [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #72]\n\t" + "ldr r7, [sp, #76]\n\t" +#else + "ldrd r6, r7, [sp, #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #112]\n\t" + "ldr r9, [sp, #116]\n\t" +#else + "ldrd r8, r9, [sp, #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #152]\n\t" + "ldr r11, [sp, #156]\n\t" +#else + "ldrd r10, r11, [sp, #152]\n\t" +#endif + "ldr r12, [sp, #192]\n\t" + "ldr lr, [sp, #196]\n\t" + "eor r4, r4, r2\n\t" + "eor r5, r5, r3\n\t" + "eor r6, r6, r2\n\t" + "eor r7, r7, r3\n\t" + "eor r8, r8, r2\n\t" + "eor r9, r9, r3\n\t" + "eor r10, r10, r2\n\t" + "eor r11, r11, r3\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #32]\n\t" + "str r5, [sp, #36]\n\t" +#else + "strd r4, r5, [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #72]\n\t" + "str r7, [sp, #76]\n\t" +#else + "strd r6, r7, [sp, #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #112]\n\t" + "str r9, [sp, #116]\n\t" +#else + "strd r8, r9, [sp, #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r10, [sp, #152]\n\t" + "str r11, [sp, #156]\n\t" +#else + "strd r10, r11, [sp, #152]\n\t" +#endif + "str r12, [sp, #192]\n\t" + "str lr, [sp, #196]\n\t" + /* Row Mix */ + /* Row 0 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp]\n\t" + "ldr r3, [sp, #4]\n\t" +#else + "ldrd r2, r3, [sp]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #52]\n\t" +#else + "ldrd r4, r5, [sp, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #96]\n\t" + "ldr r7, [sp, #100]\n\t" +#else + "ldrd r6, r7, [sp, #96]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #144]\n\t" + "ldr r9, [sp, #148]\n\t" +#else + "ldrd r8, r9, [sp, #144]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #192]\n\t" + "ldr r11, [sp, #196]\n\t" +#else + "ldrd r10, r11, [sp, #192]\n\t" +#endif + /* s[1] <<< 44 */ + "mov lr, r4\n\t" + "lsr r12, r5, #20\n\t" + "lsr r4, r4, #20\n\t" + "orr r4, r4, r5, lsl #12\n\t" + "orr r5, r12, lr, lsl #12\n\t" + /* s[2] <<< 43 */ + "mov lr, r6\n\t" + "lsr r12, r7, #21\n\t" + "lsr r6, r6, #21\n\t" + "orr r6, r6, r7, lsl #11\n\t" + "orr r7, r12, lr, lsl #11\n\t" + /* s[3] <<< 21 */ + "lsr r12, r9, #11\n\t" + "lsr lr, r8, #11\n\t" + "orr r8, r12, r8, lsl #21\n\t" + "orr r9, lr, r9, lsl #21\n\t" + /* s[4] <<< 14 */ + "lsr r12, r11, #18\n\t" + "lsr lr, r10, #18\n\t" + "orr r10, r12, r10, lsl #14\n\t" + "orr r11, lr, r11, lsl #14\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [%[state], #8]\n\t" + "str lr, [%[state], #12]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [%[state], #16]\n\t" + "str lr, [%[state], #20]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [%[state], #24]\n\t" + "str lr, [%[state], #28]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state], #32]\n\t" + "str lr, [%[state], #36]\n\t" + /* Get constant */ +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [r1]\n\t" + "ldr r11, [r1, #4]\n\t" +#else + "ldrd r10, r11, [r1]\n\t" +#endif + "add r1, r1, #8\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + /* XOR in constant */ + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state]]\n\t" + "str lr, [%[state], #4]\n\t" + /* Row 1 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #24]\n\t" + "ldr r3, [sp, #28]\n\t" +#else + "ldrd r2, r3, [sp, #24]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #72]\n\t" + "ldr r5, [sp, #76]\n\t" +#else + "ldrd r4, r5, [sp, #72]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #80]\n\t" + "ldr r7, [sp, #84]\n\t" +#else + "ldrd r6, r7, [sp, #80]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #128]\n\t" + "ldr r9, [sp, #132]\n\t" +#else + "ldrd r8, r9, [sp, #128]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #176]\n\t" + "ldr r11, [sp, #180]\n\t" +#else + "ldrd r10, r11, [sp, #176]\n\t" +#endif + /* s[0] <<< 28 */ + "lsr r12, r3, #4\n\t" + "lsr lr, r2, #4\n\t" + "orr r2, r12, r2, lsl #28\n\t" + "orr r3, lr, r3, lsl #28\n\t" + /* s[1] <<< 20 */ + "lsr r12, r5, #12\n\t" + "lsr lr, r4, #12\n\t" + "orr r4, r12, r4, lsl #20\n\t" + "orr r5, lr, r5, lsl #20\n\t" + /* s[2] <<< 3 */ + "lsr r12, r7, #29\n\t" + "lsr lr, r6, #29\n\t" + "orr r6, r12, r6, lsl #3\n\t" + "orr r7, lr, r7, lsl #3\n\t" + /* s[3] <<< 45 */ + "mov lr, r8\n\t" + "lsr r12, r9, #19\n\t" + "lsr r8, r8, #19\n\t" + "orr r8, r8, r9, lsl #13\n\t" + "orr r9, r12, lr, lsl #13\n\t" + /* s[4] <<< 61 */ + "mov lr, r10\n\t" + "lsr r12, r11, #3\n\t" + "lsr r10, r10, #3\n\t" + "orr r10, r10, r11, lsl #29\n\t" + "orr r11, r12, lr, lsl #29\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [%[state], #48]\n\t" + "str lr, [%[state], #52]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [%[state], #56]\n\t" + "str lr, [%[state], #60]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [%[state], #64]\n\t" + "str lr, [%[state], #68]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state], #72]\n\t" + "str lr, [%[state], #76]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [%[state], #40]\n\t" + "str lr, [%[state], #44]\n\t" + /* Row 2 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #8]\n\t" + "ldr r3, [sp, #12]\n\t" +#else + "ldrd r2, r3, [sp, #8]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #56]\n\t" + "ldr r5, [sp, #60]\n\t" +#else + "ldrd r4, r5, [sp, #56]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #104]\n\t" + "ldr r7, [sp, #108]\n\t" +#else + "ldrd r6, r7, [sp, #104]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #152]\n\t" + "ldr r9, [sp, #156]\n\t" +#else + "ldrd r8, r9, [sp, #152]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #160]\n\t" + "ldr r11, [sp, #164]\n\t" +#else + "ldrd r10, r11, [sp, #160]\n\t" +#endif + /* s[0] <<< 1 */ + "lsr r12, r3, #31\n\t" + 
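+        /* 64-bit rotate-left by 1 of a lane held as r2 (low) / r3 (high); in
+         * C terms: new_lo = (lo << 1) | (hi >> 31),
+         *          new_hi = (hi << 1) | (lo >> 31).
+         * The lsr/orr pairs feed the bit shifted out of one half into the
+         * other; larger rotations are built the same way, with the two halves
+         * swapping roles once the amount reaches 32. */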
"lsr lr, r2, #31\n\t" + "orr r2, r12, r2, lsl #1\n\t" + "orr r3, lr, r3, lsl #1\n\t" + /* s[1] <<< 6 */ + "lsr r12, r5, #26\n\t" + "lsr lr, r4, #26\n\t" + "orr r4, r12, r4, lsl #6\n\t" + "orr r5, lr, r5, lsl #6\n\t" + /* s[2] <<< 25 */ + "lsr r12, r7, #7\n\t" + "lsr lr, r6, #7\n\t" + "orr r6, r12, r6, lsl #25\n\t" + "orr r7, lr, r7, lsl #25\n\t" + /* s[3] <<< 8 */ + "lsr r12, r9, #24\n\t" + "lsr lr, r8, #24\n\t" + "orr r8, r12, r8, lsl #8\n\t" + "orr r9, lr, r9, lsl #8\n\t" + /* s[4] <<< 18 */ + "lsr r12, r11, #14\n\t" + "lsr lr, r10, #14\n\t" + "orr r10, r12, r10, lsl #18\n\t" + "orr r11, lr, r11, lsl #18\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [%[state], #88]\n\t" + "str lr, [%[state], #92]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [%[state], #96]\n\t" + "str lr, [%[state], #100]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [%[state], #104]\n\t" + "str lr, [%[state], #108]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state], #112]\n\t" + "str lr, [%[state], #116]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [%[state], #80]\n\t" + "str lr, [%[state], #84]\n\t" + /* Row 3 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #32]\n\t" + "ldr r3, [sp, #36]\n\t" +#else + "ldrd r2, r3, [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #40]\n\t" + "ldr r5, [sp, #44]\n\t" +#else + "ldrd r4, r5, [sp, #40]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #88]\n\t" + "ldr r7, [sp, #92]\n\t" +#else + "ldrd r6, r7, [sp, #88]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #136]\n\t" + "ldr r9, [sp, #140]\n\t" +#else + "ldrd r8, r9, [sp, #136]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #184]\n\t" + "ldr r11, [sp, #188]\n\t" +#else + "ldrd r10, r11, [sp, #184]\n\t" +#endif + /* s[0] <<< 27 */ + "lsr r12, r3, #5\n\t" + "lsr lr, r2, #5\n\t" + "orr r2, r12, r2, lsl #27\n\t" + "orr r3, lr, r3, lsl #27\n\t" + /* s[1] <<< 36 */ + "mov lr, r4\n\t" + "lsr r12, r5, #28\n\t" + "lsr r4, r4, #28\n\t" + "orr r4, r4, r5, lsl #4\n\t" + "orr r5, r12, lr, lsl #4\n\t" + /* s[2] <<< 10 */ + "lsr r12, r7, #22\n\t" + "lsr lr, r6, #22\n\t" + "orr r6, r12, r6, lsl #10\n\t" + "orr r7, lr, r7, lsl #10\n\t" + /* s[3] <<< 15 */ + "lsr r12, r9, #17\n\t" + "lsr lr, r8, #17\n\t" + "orr r8, r12, r8, lsl #15\n\t" + "orr r9, lr, r9, lsl #15\n\t" + /* s[4] <<< 56 */ + "mov lr, r10\n\t" + "lsr r12, r11, #8\n\t" + "lsr r10, r10, #8\n\t" + "orr r10, r10, r11, lsl #24\n\t" + "orr r11, r12, lr, lsl #24\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [%[state], #128]\n\t" + "str lr, [%[state], #132]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [%[state], #136]\n\t" + "str lr, [%[state], #140]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [%[state], #144]\n\t" + "str lr, [%[state], #148]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state], 
#152]\n\t" + "str lr, [%[state], #156]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [%[state], #120]\n\t" + "str lr, [%[state], #124]\n\t" + /* Row 4 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r2, [sp, #16]\n\t" + "ldr r3, [sp, #20]\n\t" +#else + "ldrd r2, r3, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [sp, #64]\n\t" + "ldr r5, [sp, #68]\n\t" +#else + "ldrd r4, r5, [sp, #64]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r6, [sp, #112]\n\t" + "ldr r7, [sp, #116]\n\t" +#else + "ldrd r6, r7, [sp, #112]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r8, [sp, #120]\n\t" + "ldr r9, [sp, #124]\n\t" +#else + "ldrd r8, r9, [sp, #120]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r10, [sp, #168]\n\t" + "ldr r11, [sp, #172]\n\t" +#else + "ldrd r10, r11, [sp, #168]\n\t" +#endif + /* s[0] <<< 62 */ + "mov lr, r2\n\t" + "lsr r12, r3, #2\n\t" + "lsr r2, r2, #2\n\t" + "orr r2, r2, r3, lsl #30\n\t" + "orr r3, r12, lr, lsl #30\n\t" + /* s[1] <<< 55 */ + "mov lr, r4\n\t" + "lsr r12, r5, #9\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "orr r5, r12, lr, lsl #23\n\t" + /* s[2] <<< 39 */ + "mov lr, r6\n\t" + "lsr r12, r7, #25\n\t" + "lsr r6, r6, #25\n\t" + "orr r6, r6, r7, lsl #7\n\t" + "orr r7, r12, lr, lsl #7\n\t" + /* s[3] <<< 41 */ + "mov lr, r8\n\t" + "lsr r12, r9, #23\n\t" + "lsr r8, r8, #23\n\t" + "orr r8, r8, r9, lsl #9\n\t" + "orr r9, r12, lr, lsl #9\n\t" + /* s[4] <<< 2 */ + "lsr r12, r11, #30\n\t" + "lsr lr, r10, #30\n\t" + "orr r10, r12, r10, lsl #2\n\t" + "orr r11, lr, r11, lsl #2\n\t" + "bic r12, r8, r6\n\t" + "bic lr, r9, r7\n\t" + "eor r12, r12, r4\n\t" + "eor lr, lr, r5\n\t" + "str r12, [%[state], #168]\n\t" + "str lr, [%[state], #172]\n\t" + "bic r12, r10, r8\n\t" + "bic lr, r11, r9\n\t" + "eor r12, r12, r6\n\t" + "eor lr, lr, r7\n\t" + "str r12, [%[state], #176]\n\t" + "str lr, [%[state], #180]\n\t" + "bic r12, r2, r10\n\t" + "bic lr, r3, r11\n\t" + "eor r12, r12, r8\n\t" + "eor lr, lr, r9\n\t" + "str r12, [%[state], #184]\n\t" + "str lr, [%[state], #188]\n\t" + "bic r12, r4, r2\n\t" + "bic lr, r5, r3\n\t" + "eor r12, r12, r10\n\t" + "eor lr, lr, r11\n\t" + "str r12, [%[state], #192]\n\t" + "str lr, [%[state], #196]\n\t" + "bic r12, r6, r4\n\t" + "bic lr, r7, r5\n\t" + "eor r12, r12, r2\n\t" + "eor lr, lr, r3\n\t" + "str r12, [%[state], #160]\n\t" + "str lr, [%[state], #164]\n\t" + "ldr r2, [sp, #200]\n\t" + "subs r2, r2, #1\n\t" + "bne L_sha3_arm32_begin_%=\n\t" + "add sp, sp, #0xcc\n\t" + : [state] "+r" (state), [L_sha3_arm2_rt] "+r" (L_sha3_arm2_rt_c) + : + : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", + "r9", "r10", "r11" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA3 */ +#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ + +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S new file mode 100644 index 000000000..362a0ab80 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -0,0 +1,3369 @@ +/* thumb2-aes-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./aes/aes.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-aes-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifndef NO_AES +#ifdef HAVE_AES_DECRYPT + .text + .type L_AES_Thumb2_td_data, %object + .size L_AES_Thumb2_td_data, 1024 + .align 4 +L_AES_Thumb2_td_data: + .word 0x5051f4a7 + .word 0x537e4165 + .word 0xc31a17a4 + .word 0x963a275e + .word 0xcb3bab6b + .word 0xf11f9d45 + .word 0xabacfa58 + .word 0x934be303 + .word 0x552030fa + .word 0xf6ad766d + .word 0x9188cc76 + .word 0x25f5024c + .word 0xfc4fe5d7 + .word 0xd7c52acb + .word 0x80263544 + .word 0x8fb562a3 + .word 0x49deb15a + .word 0x6725ba1b + .word 0x9845ea0e + .word 0xe15dfec0 + .word 0x2c32f75 + .word 0x12814cf0 + .word 0xa38d4697 + .word 0xc66bd3f9 + .word 0xe7038f5f + .word 0x9515929c + .word 0xebbf6d7a + .word 0xda955259 + .word 0x2dd4be83 + .word 0xd3587421 + .word 0x2949e069 + .word 0x448ec9c8 + .word 0x6a75c289 + .word 0x78f48e79 + .word 0x6b99583e + .word 0xdd27b971 + .word 0xb6bee14f + .word 0x17f088ad + .word 0x66c920ac + .word 0xb47dce3a + .word 0x1863df4a + .word 0x82e51a31 + .word 0x60975133 + .word 0x4562537f + .word 0xe0b16477 + .word 0x84bb6bae + .word 0x1cfe81a0 + .word 0x94f9082b + .word 0x58704868 + .word 0x198f45fd + .word 0x8794de6c + .word 0xb7527bf8 + .word 0x23ab73d3 + .word 0xe2724b02 + .word 0x57e31f8f + .word 0x2a6655ab + .word 0x7b2eb28 + .word 0x32fb5c2 + .word 0x9a86c57b + .word 0xa5d33708 + .word 0xf2302887 + .word 0xb223bfa5 + .word 0xba02036a + .word 0x5ced1682 + .word 0x2b8acf1c + .word 0x92a779b4 + .word 0xf0f307f2 + .word 0xa14e69e2 + .word 0xcd65daf4 + .word 0xd50605be + .word 0x1fd13462 + .word 0x8ac4a6fe + .word 0x9d342e53 + .word 0xa0a2f355 + .word 0x32058ae1 + .word 0x75a4f6eb + .word 0x390b83ec + .word 0xaa4060ef + .word 0x65e719f + .word 0x51bd6e10 + .word 0xf93e218a + .word 0x3d96dd06 + .word 0xaedd3e05 + .word 0x464de6bd + .word 0xb591548d + .word 0x571c45d + .word 0x6f0406d4 + .word 0xff605015 + .word 0x241998fb + .word 0x97d6bde9 + .word 0xcc894043 + .word 0x7767d99e + .word 0xbdb0e842 + .word 0x8807898b + .word 0x38e7195b + .word 0xdb79c8ee + .word 0x47a17c0a + .word 0xe97c420f + .word 0xc9f8841e + .word 0x0 + .word 0x83098086 + .word 0x48322bed + .word 0xac1e1170 + .word 0x4e6c5a72 + .word 0xfbfd0eff + .word 0x560f8538 + .word 0x1e3daed5 + .word 0x27362d39 + .word 0x640a0fd9 + .word 0x21685ca6 + .word 0xd19b5b54 + .word 0x3a24362e + .word 0xb10c0a67 + .word 0xf9357e7 + .word 0xd2b4ee96 + .word 0x9e1b9b91 + .word 0x4f80c0c5 + .word 0xa261dc20 + .word 0x695a774b + .word 0x161c121a + .word 0xae293ba + .word 0xe5c0a02a + .word 0x433c22e0 + .word 0x1d121b17 + 
.word 0xb0e090d + .word 0xadf28bc7 + .word 0xb92db6a8 + .word 0xc8141ea9 + .word 0x8557f119 + .word 0x4caf7507 + .word 0xbbee99dd + .word 0xfda37f60 + .word 0x9ff70126 + .word 0xbc5c72f5 + .word 0xc544663b + .word 0x345bfb7e + .word 0x768b4329 + .word 0xdccb23c6 + .word 0x68b6edfc + .word 0x63b8e4f1 + .word 0xcad731dc + .word 0x10426385 + .word 0x40139722 + .word 0x2084c611 + .word 0x7d854a24 + .word 0xf8d2bb3d + .word 0x11aef932 + .word 0x6dc729a1 + .word 0x4b1d9e2f + .word 0xf3dcb230 + .word 0xec0d8652 + .word 0xd077c1e3 + .word 0x6c2bb316 + .word 0x99a970b9 + .word 0xfa119448 + .word 0x2247e964 + .word 0xc4a8fc8c + .word 0x1aa0f03f + .word 0xd8567d2c + .word 0xef223390 + .word 0xc787494e + .word 0xc1d938d1 + .word 0xfe8ccaa2 + .word 0x3698d40b + .word 0xcfa6f581 + .word 0x28a57ade + .word 0x26dab78e + .word 0xa43fadbf + .word 0xe42c3a9d + .word 0xd507892 + .word 0x9b6a5fcc + .word 0x62547e46 + .word 0xc2f68d13 + .word 0xe890d8b8 + .word 0x5e2e39f7 + .word 0xf582c3af + .word 0xbe9f5d80 + .word 0x7c69d093 + .word 0xa96fd52d + .word 0xb3cf2512 + .word 0x3bc8ac99 + .word 0xa710187d + .word 0x6ee89c63 + .word 0x7bdb3bbb + .word 0x9cd2678 + .word 0xf46e5918 + .word 0x1ec9ab7 + .word 0xa8834f9a + .word 0x65e6956e + .word 0x7eaaffe6 + .word 0x821bccf + .word 0xe6ef15e8 + .word 0xd9bae79b + .word 0xce4a6f36 + .word 0xd4ea9f09 + .word 0xd629b07c + .word 0xaf31a4b2 + .word 0x312a3f23 + .word 0x30c6a594 + .word 0xc035a266 + .word 0x37744ebc + .word 0xa6fc82ca + .word 0xb0e090d0 + .word 0x1533a7d8 + .word 0x4af10498 + .word 0xf741ecda + .word 0xe7fcd50 + .word 0x2f1791f6 + .word 0x8d764dd6 + .word 0x4d43efb0 + .word 0x54ccaa4d + .word 0xdfe49604 + .word 0xe39ed1b5 + .word 0x1b4c6a88 + .word 0xb8c12c1f + .word 0x7f466551 + .word 0x49d5eea + .word 0x5d018c35 + .word 0x73fa8774 + .word 0x2efb0b41 + .word 0x5ab3671d + .word 0x5292dbd2 + .word 0x33e91056 + .word 0x136dd647 + .word 0x8c9ad761 + .word 0x7a37a10c + .word 0x8e59f814 + .word 0x89eb133c + .word 0xeecea927 + .word 0x35b761c9 + .word 0xede11ce5 + .word 0x3c7a47b1 + .word 0x599cd2df + .word 0x3f55f273 + .word 0x791814ce + .word 0xbf73c737 + .word 0xea53f7cd + .word 0x5b5ffdaa + .word 0x14df3d6f + .word 0x867844db + .word 0x81caaff3 + .word 0x3eb968c4 + .word 0x2c382434 + .word 0x5fc2a340 + .word 0x72161dc3 + .word 0xcbce225 + .word 0x8b283c49 + .word 0x41ff0d95 + .word 0x7139a801 + .word 0xde080cb3 + .word 0x9cd8b4e4 + .word 0x906456c1 + .word 0x617bcb84 + .word 0x70d532b6 + .word 0x74486c5c + .word 0x42d0b857 +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + .text + .type L_AES_Thumb2_te_data, %object + .size L_AES_Thumb2_te_data, 1024 + .align 4 +L_AES_Thumb2_te_data: + .word 0xa5c66363 + .word 0x84f87c7c + .word 0x99ee7777 + .word 0x8df67b7b + .word 0xdfff2f2 + .word 0xbdd66b6b + .word 0xb1de6f6f + .word 0x5491c5c5 + .word 0x50603030 + .word 0x3020101 + .word 0xa9ce6767 + .word 0x7d562b2b + .word 0x19e7fefe + .word 0x62b5d7d7 + .word 0xe64dabab + .word 0x9aec7676 + .word 0x458fcaca + .word 0x9d1f8282 + .word 0x4089c9c9 + .word 0x87fa7d7d + .word 0x15effafa + .word 0xebb25959 + .word 0xc98e4747 + .word 0xbfbf0f0 + .word 0xec41adad + .word 0x67b3d4d4 + .word 0xfd5fa2a2 + .word 0xea45afaf + .word 0xbf239c9c + .word 0xf753a4a4 + .word 0x96e47272 + .word 0x5b9bc0c0 + .word 0xc275b7b7 + .word 0x1ce1fdfd + .word 0xae3d9393 + .word 0x6a4c2626 + .word 0x5a6c3636 + .word 0x417e3f3f + .word 0x2f5f7f7 + 
.word 0x4f83cccc + .word 0x5c683434 + .word 0xf451a5a5 + .word 0x34d1e5e5 + .word 0x8f9f1f1 + .word 0x93e27171 + .word 0x73abd8d8 + .word 0x53623131 + .word 0x3f2a1515 + .word 0xc080404 + .word 0x5295c7c7 + .word 0x65462323 + .word 0x5e9dc3c3 + .word 0x28301818 + .word 0xa1379696 + .word 0xf0a0505 + .word 0xb52f9a9a + .word 0x90e0707 + .word 0x36241212 + .word 0x9b1b8080 + .word 0x3ddfe2e2 + .word 0x26cdebeb + .word 0x694e2727 + .word 0xcd7fb2b2 + .word 0x9fea7575 + .word 0x1b120909 + .word 0x9e1d8383 + .word 0x74582c2c + .word 0x2e341a1a + .word 0x2d361b1b + .word 0xb2dc6e6e + .word 0xeeb45a5a + .word 0xfb5ba0a0 + .word 0xf6a45252 + .word 0x4d763b3b + .word 0x61b7d6d6 + .word 0xce7db3b3 + .word 0x7b522929 + .word 0x3edde3e3 + .word 0x715e2f2f + .word 0x97138484 + .word 0xf5a65353 + .word 0x68b9d1d1 + .word 0x0 + .word 0x2cc1eded + .word 0x60402020 + .word 0x1fe3fcfc + .word 0xc879b1b1 + .word 0xedb65b5b + .word 0xbed46a6a + .word 0x468dcbcb + .word 0xd967bebe + .word 0x4b723939 + .word 0xde944a4a + .word 0xd4984c4c + .word 0xe8b05858 + .word 0x4a85cfcf + .word 0x6bbbd0d0 + .word 0x2ac5efef + .word 0xe54faaaa + .word 0x16edfbfb + .word 0xc5864343 + .word 0xd79a4d4d + .word 0x55663333 + .word 0x94118585 + .word 0xcf8a4545 + .word 0x10e9f9f9 + .word 0x6040202 + .word 0x81fe7f7f + .word 0xf0a05050 + .word 0x44783c3c + .word 0xba259f9f + .word 0xe34ba8a8 + .word 0xf3a25151 + .word 0xfe5da3a3 + .word 0xc0804040 + .word 0x8a058f8f + .word 0xad3f9292 + .word 0xbc219d9d + .word 0x48703838 + .word 0x4f1f5f5 + .word 0xdf63bcbc + .word 0xc177b6b6 + .word 0x75afdada + .word 0x63422121 + .word 0x30201010 + .word 0x1ae5ffff + .word 0xefdf3f3 + .word 0x6dbfd2d2 + .word 0x4c81cdcd + .word 0x14180c0c + .word 0x35261313 + .word 0x2fc3ecec + .word 0xe1be5f5f + .word 0xa2359797 + .word 0xcc884444 + .word 0x392e1717 + .word 0x5793c4c4 + .word 0xf255a7a7 + .word 0x82fc7e7e + .word 0x477a3d3d + .word 0xacc86464 + .word 0xe7ba5d5d + .word 0x2b321919 + .word 0x95e67373 + .word 0xa0c06060 + .word 0x98198181 + .word 0xd19e4f4f + .word 0x7fa3dcdc + .word 0x66442222 + .word 0x7e542a2a + .word 0xab3b9090 + .word 0x830b8888 + .word 0xca8c4646 + .word 0x29c7eeee + .word 0xd36bb8b8 + .word 0x3c281414 + .word 0x79a7dede + .word 0xe2bc5e5e + .word 0x1d160b0b + .word 0x76addbdb + .word 0x3bdbe0e0 + .word 0x56643232 + .word 0x4e743a3a + .word 0x1e140a0a + .word 0xdb924949 + .word 0xa0c0606 + .word 0x6c482424 + .word 0xe4b85c5c + .word 0x5d9fc2c2 + .word 0x6ebdd3d3 + .word 0xef43acac + .word 0xa6c46262 + .word 0xa8399191 + .word 0xa4319595 + .word 0x37d3e4e4 + .word 0x8bf27979 + .word 0x32d5e7e7 + .word 0x438bc8c8 + .word 0x596e3737 + .word 0xb7da6d6d + .word 0x8c018d8d + .word 0x64b1d5d5 + .word 0xd29c4e4e + .word 0xe049a9a9 + .word 0xb4d86c6c + .word 0xfaac5656 + .word 0x7f3f4f4 + .word 0x25cfeaea + .word 0xafca6565 + .word 0x8ef47a7a + .word 0xe947aeae + .word 0x18100808 + .word 0xd56fbaba + .word 0x88f07878 + .word 0x6f4a2525 + .word 0x725c2e2e + .word 0x24381c1c + .word 0xf157a6a6 + .word 0xc773b4b4 + .word 0x5197c6c6 + .word 0x23cbe8e8 + .word 0x7ca1dddd + .word 0x9ce87474 + .word 0x213e1f1f + .word 0xdd964b4b + .word 0xdc61bdbd + .word 0x860d8b8b + .word 0x850f8a8a + .word 0x90e07070 + .word 0x427c3e3e + .word 0xc471b5b5 + .word 0xaacc6666 + .word 0xd8904848 + .word 0x5060303 + .word 0x1f7f6f6 + .word 0x121c0e0e + .word 0xa3c26161 + .word 0x5f6a3535 + .word 0xf9ae5757 + .word 0xd069b9b9 + .word 0x91178686 + .word 0x5899c1c1 + .word 0x273a1d1d + .word 0xb9279e9e + .word 0x38d9e1e1 + .word 0x13ebf8f8 + .word 0xb32b9898 + 
.word 0x33221111 + .word 0xbbd26969 + .word 0x70a9d9d9 + .word 0x89078e8e + .word 0xa7339494 + .word 0xb62d9b9b + .word 0x223c1e1e + .word 0x92158787 + .word 0x20c9e9e9 + .word 0x4987cece + .word 0xffaa5555 + .word 0x78502828 + .word 0x7aa5dfdf + .word 0x8f038c8c + .word 0xf859a1a1 + .word 0x80098989 + .word 0x171a0d0d + .word 0xda65bfbf + .word 0x31d7e6e6 + .word 0xc6844242 + .word 0xb8d06868 + .word 0xc3824141 + .word 0xb0299999 + .word 0x775a2d2d + .word 0x111e0f0f + .word 0xcb7bb0b0 + .word 0xfca85454 + .word 0xd66dbbbb + .word 0x3a2c1616 +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT + .text + .type L_AES_Thumb2_td, %object + .size L_AES_Thumb2_td, 12 + .align 4 +L_AES_Thumb2_td: + .word L_AES_Thumb2_td_data +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + .text + .type L_AES_Thumb2_te, %object + .size L_AES_Thumb2_te, 12 + .align 4 +L_AES_Thumb2_te: + .word L_AES_Thumb2_te_data +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT + .text + .align 4 + .globl AES_invert_key + .type AES_invert_key, %function +AES_invert_key: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r12, L_AES_Thumb2_te + LDR lr, L_AES_Thumb2_td + ADD r10, r0, r1, LSL #4 + MOV r11, r1 +L_AES_invert_key_loop: + LDM r0, {r2, r3, r4, r5} + LDM r10, {r6, r7, r8, r9} + STM r10, {r2, r3, r4, r5} + STM r0!, {r6, r7, r8, r9} + SUBS r11, r11, #0x2 + SUB r10, r10, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_invert_key_loop +#else + BNE.N L_AES_invert_key_loop +#endif + SUB r0, r0, r1, LSL #3 + ADD r0, r0, #0x10 + SUB r11, r1, #0x1 +L_AES_invert_key_mix_loop: + LDM r0, {r2, r3, r4, r5} + UBFX r6, r2, #0, #8 + UBFX r7, r2, #8, #8 + UBFX r8, r2, #16, #8 + LSR r9, r2, #24 + LDRB r6, [r12, r6, LSL #2] + LDRB r7, [r12, r7, LSL #2] + LDRB r8, [r12, r8, LSL #2] + LDRB r9, [r12, r9, LSL #2] + LDR r6, [lr, r6, LSL #2] + LDR r7, [lr, r7, LSL #2] + LDR r8, [lr, r8, LSL #2] + LDR r9, [lr, r9, LSL #2] + EOR r8, r8, r6, ROR #16 + EOR r8, r8, r7, ROR #8 + EOR r8, r8, r9, ROR #24 + STR r8, [r0], #4 + UBFX r6, r3, #0, #8 + UBFX r7, r3, #8, #8 + UBFX r8, r3, #16, #8 + LSR r9, r3, #24 + LDRB r6, [r12, r6, LSL #2] + LDRB r7, [r12, r7, LSL #2] + LDRB r8, [r12, r8, LSL #2] + LDRB r9, [r12, r9, LSL #2] + LDR r6, [lr, r6, LSL #2] + LDR r7, [lr, r7, LSL #2] + LDR r8, [lr, r8, LSL #2] + LDR r9, [lr, r9, LSL #2] + EOR r8, r8, r6, ROR #16 + EOR r8, r8, r7, ROR #8 + EOR r8, r8, r9, ROR #24 + STR r8, [r0], #4 + UBFX r6, r4, #0, #8 + UBFX r7, r4, #8, #8 + UBFX r8, r4, #16, #8 + LSR r9, r4, #24 + LDRB r6, [r12, r6, LSL #2] + LDRB r7, [r12, r7, LSL #2] + LDRB r8, [r12, r8, LSL #2] + LDRB r9, [r12, r9, LSL #2] + LDR r6, [lr, r6, LSL #2] + LDR r7, [lr, r7, LSL #2] + LDR r8, [lr, r8, LSL #2] + LDR r9, [lr, r9, LSL #2] + EOR r8, r8, r6, ROR #16 + EOR r8, r8, r7, ROR #8 + EOR r8, r8, r9, ROR #24 + STR r8, [r0], #4 + UBFX r6, r5, #0, #8 + UBFX r7, r5, #8, #8 + UBFX r8, r5, #16, #8 + LSR r9, r5, #24 + LDRB r6, [r12, r6, LSL #2] + LDRB r7, [r12, r7, LSL #2] + LDRB r8, [r12, r8, LSL #2] + LDRB r9, [r12, r9, LSL #2] + LDR r6, [lr, r6, LSL #2] + LDR r7, [lr, r7, LSL #2] + LDR r8, [lr, r8, LSL #2] + LDR r9, [lr, r9, LSL #2] + EOR r8, r8, r6, ROR #16 + EOR r8, r8, r7, ROR #8 + 
EOR r8, r8, r9, ROR #24 + STR r8, [r0], #4 + SUBS r11, r11, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_invert_key_mix_loop +#else + BNE.W L_AES_invert_key_mix_loop +#endif + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 165 */ + .size AES_invert_key,.-AES_invert_key +#endif /* HAVE_AES_DECRYPT */ + .text + .type L_AES_Thumb2_rcon, %object + .size L_AES_Thumb2_rcon, 40 + .align 4 +L_AES_Thumb2_rcon: + .word 0x1000000 + .word 0x2000000 + .word 0x4000000 + .word 0x8000000 + .word 0x10000000 + .word 0x20000000 + .word 0x40000000 + .word 0x80000000 + .word 0x1b000000 + .word 0x36000000 + .text + .align 4 + .globl AES_set_encrypt_key + .type AES_set_encrypt_key, %function +AES_set_encrypt_key: + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + LDR r10, L_AES_Thumb2_te + ADR lr, L_AES_Thumb2_rcon + CMP r1, #0x80 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_set_encrypt_key_start_128 +#else + BEQ.W L_AES_set_encrypt_key_start_128 +#endif + CMP r1, #0xc0 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_set_encrypt_key_start_192 +#else + BEQ.W L_AES_set_encrypt_key_start_192 +#endif + LDR r4, [r0] + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r2!, {r4, r5, r6, r7} + LDR r4, [r0, #16] + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r2, {r4, r5, r6, r7} + SUB r2, r2, #0x10 + MOV r12, #0x6 +L_AES_set_encrypt_key_loop_256: + UBFX r4, r7, #0, #8 + UBFX r5, r7, #8, #8 + UBFX r6, r7, #16, #8 + LSR r7, r7, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] + EOR r3, r7, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7} + EOR r4, r4, r3 + LDM lr!, {r3} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + ADD r2, r2, #0x10 + STM r2, {r4, r5, r6, r7} + SUB r2, r2, #0x10 + MOV r3, r7 + UBFX r4, r3, #8, #8 + UBFX r5, r3, #16, #8 + LSR r6, r3, #24 + UBFX r3, r3, #0, #8 + LDRB r4, [r10, r4, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r3, [r10, r3, LSL #2] + EOR r3, r3, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + ADD r2, r2, #0x10 + STM r2, {r4, r5, r6, r7} + SUB r2, r2, #0x10 + SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_set_encrypt_key_loop_256 +#else + BNE.N L_AES_set_encrypt_key_loop_256 +#endif + UBFX r4, r7, #0, #8 + UBFX r5, r7, #8, #8 + UBFX r6, r7, #16, #8 + LSR r7, r7, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] + EOR r3, r7, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7} + EOR r4, r4, r3 + LDM lr!, {r3} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + ADD r2, r2, #0x10 + STM r2, {r4, r5, r6, r7} + SUB r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif +L_AES_set_encrypt_key_start_192: + LDR r4, [r0] + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + LDR r9, [r0, #20] + REV r4, r4 + REV r5, r5 + REV r6, r6 + 
REV r7, r7 + REV r8, r8 + REV r9, r9 + STM r2, {r4, r5, r6, r7} + STRD r8, r9, [r2, #16] + MOV r7, r9 + MOV r12, #0x7 +L_AES_set_encrypt_key_loop_192: + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 + LDM lr!, {r3} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + EOR r8, r8, r7 + EOR r9, r9, r8 + STM r2, {r4, r5, r6, r7, r8, r9} + SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_set_encrypt_key_loop_192 +#else + BNE.N L_AES_set_encrypt_key_loop_192 +#endif + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 + LDM lr!, {r3} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + STM r2, {r4, r5, r6, r7} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif +L_AES_set_encrypt_key_start_128: + LDR r4, [r0] + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r2, {r4, r5, r6, r7} + MOV r12, #0xa +L_AES_set_encrypt_key_loop_128: + UBFX r4, r7, #0, #8 + UBFX r5, r7, #8, #8 + UBFX r6, r7, #16, #8 + LSR r7, r7, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] + EOR r3, r7, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7} + EOR r4, r4, r3 + LDM lr!, {r3} + EOR r4, r4, r3 + EOR r5, r5, r4 + EOR r6, r6, r5 + EOR r7, r7, r6 + STM r2, {r4, r5, r6, r7} + SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_set_encrypt_key_loop_128 +#else + BNE.N L_AES_set_encrypt_key_loop_128 +#endif +L_AES_set_encrypt_key_end: + POP {r4, r5, r6, r7, r8, r9, r10, pc} + /* Cycle Count = 340 */ + .size AES_set_encrypt_key,.-AES_set_encrypt_key + .text + .align 4 + .globl AES_encrypt_block + .type AES_encrypt_block, %function +AES_encrypt_block: + PUSH {lr} +L_AES_encrypt_block_nr: + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, 
r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r9, #16, #8 + LSR r7, r8, #24 + UBFX lr, r10, #8, #8 + UBFX r2, r11, #0, #8 + LDR r4, [r0, r4, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r5, r10, #16, #8 + EOR r4, r4, r7, ROR #24 + LSR r7, r9, #24 + EOR r4, r4, lr, ROR #8 + UBFX lr, r11, #8, #8 + EOR r4, r4, r2, ROR #16 + UBFX r2, r8, #0, #8 + LDR r5, [r0, r5, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r11, #16, #8 + EOR r5, r5, r7, ROR #24 + LSR r7, r10, #24 + EOR r5, r5, lr, ROR #8 + UBFX lr, r8, #8, #8 + EOR r5, r5, r2, ROR #16 + UBFX r2, r9, #0, #8 + LDR r6, [r0, r6, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r10, #0, #8 + EOR r6, r6, r7, ROR #24 + UBFX r7, r8, #16, #8 + EOR r6, r6, lr, ROR #8 + LSR lr, r11, #24 + EOR r6, r6, r2, ROR #16 + UBFX r2, r9, #8, #8 + LDR r10, [r0, r10, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r10, ROR #24 + LDM r3!, {r8, r9, r10, r11} + EOR r7, r7, lr, ROR #24 + EOR r7, r7, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_encrypt_block_nr +#else + BNE.W L_AES_encrypt_block_nr +#endif + UBFX r8, r5, #16, #8 + LSR r11, r4, #24 + UBFX lr, r6, #8, #8 + UBFX r2, r7, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r9, r6, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, lr, ROR #8 + UBFX lr, r7, #8, #8 + EOR r8, r8, r2, ROR #16 + UBFX r2, r4, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r10, r7, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, lr, ROR #8 + UBFX lr, r4, #8, #8 + EOR r9, r9, r2, ROR #16 + UBFX r2, r5, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r2, [r0, r2, LSL #2] + UBFX r6, r6, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r4, #16, #8 + EOR r10, r10, lr, ROR #8 + LSR lr, r7, #24 + EOR r10, r10, r2, ROR #16 + UBFX r2, r5, #8, #8 + LDR r6, [r0, r6, LSL #2] + LDR lr, [r0, lr, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r2, [r0, r2, LSL #2] + EOR lr, lr, r6, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #24 + EOR r11, r11, r2, ROR #8 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r11, #0, #8 + UBFX r7, r10, #8, #8 + UBFX lr, r9, #16, #8 + LSR r2, r8, #24 + LDRB r4, [r0, r4, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r5, r8, #0, #8 + EOR r4, r4, r7, LSL #8 + UBFX r7, r11, #8, #8 + EOR r4, r4, lr, LSL #16 + UBFX lr, r10, #16, #8 + EOR r4, r4, r2, LSL #24 + LSR r2, r9, #24 + LDRB r5, [r0, r5, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + UBFX r6, r9, #0, #8 + EOR r5, r5, r7, LSL #8 + UBFX r7, r8, #8, #8 + EOR r5, r5, lr, LSL #16 + UBFX lr, r11, #16, #8 + EOR r5, r5, r2, LSL #24 + LSR r2, r10, #24 + LDRB r6, [r0, r6, LSL #2] + LDRB 
r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + LSR r11, r11, #24 + EOR r6, r6, r7, LSL #8 + UBFX r7, r10, #0, #8 + EOR r6, r6, lr, LSL #16 + UBFX lr, r9, #8, #8 + EOR r6, r6, r2, LSL #24 + UBFX r2, r8, #16, #8 + LDRB r11, [r0, r11, LSL #2] + LDRB r7, [r0, r7, LSL #2] + LDRB lr, [r0, lr, LSL #2] + LDRB r2, [r0, r2, LSL #2] + EOR lr, lr, r11, LSL #16 + LDM r3, {r8, r9, r10, r11} + EOR r7, r7, lr, LSL #8 + EOR r7, r7, r2, LSL #16 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + POP {pc} + /* Cycle Count = 285 */ + .size AES_encrypt_block,.-AES_encrypt_block +#if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + .text + .type L_AES_Thumb2_te_ecb, %object + .size L_AES_Thumb2_te_ecb, 12 + .align 4 +L_AES_Thumb2_te_ecb: + .word L_AES_Thumb2_te_data +#endif /* HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + .text + .align 4 + .globl AES_ECB_encrypt + .type AES_ECB_encrypt, %function +AES_ECB_encrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + MOV lr, r0 + LDR r0, L_AES_Thumb2_te_ecb + LDR r12, [sp, #36] + PUSH {r3} + CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_ECB_encrypt_start_block_128 +#else + BEQ.W L_AES_ECB_encrypt_start_block_128 +#endif + CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_ECB_encrypt_start_block_192 +#else + BEQ.W L_AES_ECB_encrypt_start_block_192 +#endif +L_AES_ECB_encrypt_loop_block_256: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_encrypt_loop_block_256 +#else + BNE.W L_AES_ECB_encrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif +L_AES_ECB_encrypt_start_block_192: +L_AES_ECB_encrypt_loop_block_192: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_encrypt_loop_block_192 +#else + BNE.W L_AES_ECB_encrypt_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_ECB_encrypt_end +#else + B.N 
L_AES_ECB_encrypt_end +#endif +L_AES_ECB_encrypt_start_block_128: +L_AES_ECB_encrypt_loop_block_128: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_encrypt_loop_block_128 +#else + BNE.W L_AES_ECB_encrypt_loop_block_128 +#endif +L_AES_ECB_encrypt_end: + POP {r3} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 212 */ + .size AES_ECB_encrypt,.-AES_ECB_encrypt +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC + .text + .align 4 + .globl AES_CBC_encrypt + .type AES_CBC_encrypt, %function +AES_CBC_encrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r8, [sp, #36] + LDR r9, [sp, #40] + MOV lr, r0 + LDR r0, L_AES_Thumb2_te_ecb + LDM r9, {r4, r5, r6, r7} + PUSH {r3, r9} + CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_encrypt_start_block_128 +#else + BEQ.W L_AES_CBC_encrypt_start_block_128 +#endif + CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_encrypt_start_block_192 +#else + BEQ.W L_AES_CBC_encrypt_start_block_192 +#endif +L_AES_CBC_encrypt_loop_block_256: + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_encrypt_loop_block_256 +#else + BNE.W L_AES_CBC_encrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif +L_AES_CBC_encrypt_start_block_192: +L_AES_CBC_encrypt_loop_block_192: + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_encrypt_loop_block_192 +#else + BNE.W L_AES_CBC_encrypt_loop_block_192 +#endif +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif +L_AES_CBC_encrypt_start_block_128: +L_AES_CBC_encrypt_loop_block_128: + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + PUSH {r1, r2, lr} + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_encrypt_loop_block_128 +#else + BNE.W L_AES_CBC_encrypt_loop_block_128 +#endif +L_AES_CBC_encrypt_end: + POP {r3, r9} + STM r9, {r4, r5, r6, r7} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 238 */ + .size AES_CBC_encrypt,.-AES_CBC_encrypt +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER + .text + .align 4 + .globl AES_CTR_encrypt + .type AES_CTR_encrypt, %function +AES_CTR_encrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r12, [sp, #36] + LDR r8, [sp, #40] + MOV lr, r0 + LDR r0, L_AES_Thumb2_te_ecb + LDM r8, {r4, r5, r6, r7} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + PUSH {r3, r8} + CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CTR_encrypt_start_block_128 +#else + BEQ.W L_AES_CTR_encrypt_start_block_128 +#endif + CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CTR_encrypt_start_block_192 +#else + BEQ.W L_AES_CTR_encrypt_start_block_192 +#endif +L_AES_CTR_encrypt_loop_block_256: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADDS r11, r7, #0x1 + ADCS r10, r6, #0x0 + ADCS r9, r5, #0x0 + ADC r8, r4, #0x0 + STM lr, {r8, r9, r10, r11} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CTR_encrypt_loop_block_256 +#else + BNE.W L_AES_CTR_encrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif +L_AES_CTR_encrypt_start_block_192: +L_AES_CTR_encrypt_loop_block_192: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADDS r11, r7, #0x1 + ADCS r10, r6, #0x0 + ADCS r9, r5, #0x0 + ADC r8, r4, #0x0 + STM lr, {r8, r9, r10, r11} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, 
[lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CTR_encrypt_loop_block_192 +#else + BNE.W L_AES_CTR_encrypt_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif +L_AES_CTR_encrypt_start_block_128: +L_AES_CTR_encrypt_loop_block_128: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADDS r11, r7, #0x1 + ADCS r10, r6, #0x0 + ADCS r9, r5, #0x0 + ADC r8, r4, #0x0 + STM lr, {r8, r9, r10, r11} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CTR_encrypt_loop_block_128 +#else + BNE.W L_AES_CTR_encrypt_loop_block_128 +#endif +L_AES_CTR_encrypt_end: + POP {r3, r8} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 293 */ + .size AES_CTR_encrypt,.-AES_CTR_encrypt +#endif /* WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) + .text + .align 4 + .globl AES_decrypt_block + .type AES_decrypt_block, %function +AES_decrypt_block: + PUSH {lr} +L_AES_decrypt_block_nr: + UBFX r8, r7, #16, #8 + LSR r11, r4, #24 + UBFX r12, r6, #8, #8 + UBFX lr, r5, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r9, r4, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, r12, ROR #8 + UBFX r12, r7, #8, #8 + EOR r8, r8, lr, ROR #16 + UBFX lr, r6, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r10, r5, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, r12, ROR #8 + UBFX r12, r4, #8, #8 + EOR r9, r9, lr, ROR #16 + UBFX lr, r7, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r4, r4, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r6, #16, #8 + EOR r10, r10, r12, ROR #8 + LSR r12, r7, #24 + EOR r10, r10, lr, ROR #16 + UBFX lr, r5, #8, #8 + LDR r4, [r0, r4, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + EOR r12, r12, r4, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #8 + EOR r11, r11, r12, ROR #24 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r11, #16, #8 + LSR r7, r8, #24 + UBFX r12, r10, #8, #8 + UBFX lr, r9, #0, #8 + LDR r4, [r0, r4, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r12, [r0, r12, 
LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r5, r8, #16, #8 + EOR r4, r4, r7, ROR #24 + LSR r7, r9, #24 + EOR r4, r4, r12, ROR #8 + UBFX r12, r11, #8, #8 + EOR r4, r4, lr, ROR #16 + UBFX lr, r10, #0, #8 + LDR r5, [r0, r5, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r6, r9, #16, #8 + EOR r5, r5, r7, ROR #24 + LSR r7, r10, #24 + EOR r5, r5, r12, ROR #8 + UBFX r12, r8, #8, #8 + EOR r5, r5, lr, ROR #16 + UBFX lr, r11, #0, #8 + LDR r6, [r0, r6, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r8, r8, #0, #8 + EOR r6, r6, r7, ROR #24 + UBFX r7, r10, #16, #8 + EOR r6, r6, r12, ROR #8 + LSR r12, r11, #24 + EOR r6, r6, lr, ROR #16 + UBFX lr, r9, #8, #8 + LDR r8, [r0, r8, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR r7, [r0, r7, LSL #2] + LDR lr, [r0, lr, LSL #2] + EOR r12, r12, r8, ROR #24 + LDM r3!, {r8, r9, r10, r11} + EOR r7, r7, lr, ROR #8 + EOR r7, r7, r12, ROR #24 + /* XOR in Key Schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_decrypt_block_nr +#else + BNE.W L_AES_decrypt_block_nr +#endif + UBFX r8, r7, #16, #8 + LSR r11, r4, #24 + UBFX r12, r6, #8, #8 + UBFX lr, r5, #0, #8 + LDR r8, [r0, r8, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r9, r4, #16, #8 + EOR r8, r8, r11, ROR #24 + LSR r11, r5, #24 + EOR r8, r8, r12, ROR #8 + UBFX r12, r7, #8, #8 + EOR r8, r8, lr, ROR #16 + UBFX lr, r6, #0, #8 + LDR r9, [r0, r9, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r10, r5, #16, #8 + EOR r9, r9, r11, ROR #24 + LSR r11, r6, #24 + EOR r9, r9, r12, ROR #8 + UBFX r12, r4, #8, #8 + EOR r9, r9, lr, ROR #16 + UBFX lr, r7, #0, #8 + LDR r10, [r0, r10, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR lr, [r0, lr, LSL #2] + UBFX r4, r4, #0, #8 + EOR r10, r10, r11, ROR #24 + UBFX r11, r6, #16, #8 + EOR r10, r10, r12, ROR #8 + LSR r12, r7, #24 + EOR r10, r10, lr, ROR #16 + UBFX lr, r5, #8, #8 + LDR r4, [r0, r4, LSL #2] + LDR r12, [r0, r12, LSL #2] + LDR r11, [r0, r11, LSL #2] + LDR lr, [r0, lr, LSL #2] + EOR r12, r12, r4, ROR #24 + LDM r3!, {r4, r5, r6, r7} + EOR r11, r11, lr, ROR #8 + EOR r11, r11, r12, ROR #24 + /* XOR in Key Schedule */ + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + UBFX r4, r9, #0, #8 + UBFX r7, r10, #8, #8 + UBFX r12, r11, #16, #8 + LSR lr, r8, #24 + LDRB r4, [r2, r4] + LDRB r7, [r2, r7] + LDRB r12, [r2, r12] + LDRB lr, [r2, lr] + UBFX r5, r10, #0, #8 + EOR r4, r4, r7, LSL #8 + UBFX r7, r11, #8, #8 + EOR r4, r4, r12, LSL #16 + UBFX r12, r8, #16, #8 + EOR r4, r4, lr, LSL #24 + LSR lr, r9, #24 + LDRB r7, [r2, r7] + LDRB lr, [r2, lr] + LDRB r5, [r2, r5] + LDRB r12, [r2, r12] + UBFX r6, r11, #0, #8 + EOR r5, r5, r7, LSL #8 + UBFX r7, r8, #8, #8 + EOR r5, r5, r12, LSL #16 + UBFX r12, r9, #16, #8 + EOR r5, r5, lr, LSL #24 + LSR lr, r10, #24 + LDRB r7, [r2, r7] + LDRB lr, [r2, lr] + LDRB r6, [r2, r6] + LDRB r12, [r2, r12] + LSR r11, r11, #24 + EOR r6, r6, r7, LSL #8 + UBFX r7, r8, #0, #8 + EOR r6, r6, r12, LSL #16 + UBFX r12, r9, #8, #8 + EOR r6, r6, lr, LSL #24 + UBFX lr, r10, #16, #8 + LDRB r11, [r2, r11] + LDRB r12, [r2, r12] + LDRB r7, [r2, r7] + LDRB lr, [r2, lr] + EOR r12, r12, r11, LSL #16 + LDM r3, {r8, r9, r10, r11} + EOR r7, r7, r12, LSL #8 + EOR r7, r7, lr, LSL #16 + /* XOR in Key Schedule */ + 
EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + POP {pc} + /* Cycle Count = 285 */ + .size AES_decrypt_block,.-AES_decrypt_block + .text + .type L_AES_Thumb2_td_ecb, %object + .size L_AES_Thumb2_td_ecb, 12 + .align 4 +L_AES_Thumb2_td_ecb: + .word L_AES_Thumb2_td_data + .text + .type L_AES_Thumb2_td4, %object + .size L_AES_Thumb2_td4, 256 + .align 4 +L_AES_Thumb2_td4: + .byte 0x52 + .byte 0x9 + .byte 0x6a + .byte 0xd5 + .byte 0x30 + .byte 0x36 + .byte 0xa5 + .byte 0x38 + .byte 0xbf + .byte 0x40 + .byte 0xa3 + .byte 0x9e + .byte 0x81 + .byte 0xf3 + .byte 0xd7 + .byte 0xfb + .byte 0x7c + .byte 0xe3 + .byte 0x39 + .byte 0x82 + .byte 0x9b + .byte 0x2f + .byte 0xff + .byte 0x87 + .byte 0x34 + .byte 0x8e + .byte 0x43 + .byte 0x44 + .byte 0xc4 + .byte 0xde + .byte 0xe9 + .byte 0xcb + .byte 0x54 + .byte 0x7b + .byte 0x94 + .byte 0x32 + .byte 0xa6 + .byte 0xc2 + .byte 0x23 + .byte 0x3d + .byte 0xee + .byte 0x4c + .byte 0x95 + .byte 0xb + .byte 0x42 + .byte 0xfa + .byte 0xc3 + .byte 0x4e + .byte 0x8 + .byte 0x2e + .byte 0xa1 + .byte 0x66 + .byte 0x28 + .byte 0xd9 + .byte 0x24 + .byte 0xb2 + .byte 0x76 + .byte 0x5b + .byte 0xa2 + .byte 0x49 + .byte 0x6d + .byte 0x8b + .byte 0xd1 + .byte 0x25 + .byte 0x72 + .byte 0xf8 + .byte 0xf6 + .byte 0x64 + .byte 0x86 + .byte 0x68 + .byte 0x98 + .byte 0x16 + .byte 0xd4 + .byte 0xa4 + .byte 0x5c + .byte 0xcc + .byte 0x5d + .byte 0x65 + .byte 0xb6 + .byte 0x92 + .byte 0x6c + .byte 0x70 + .byte 0x48 + .byte 0x50 + .byte 0xfd + .byte 0xed + .byte 0xb9 + .byte 0xda + .byte 0x5e + .byte 0x15 + .byte 0x46 + .byte 0x57 + .byte 0xa7 + .byte 0x8d + .byte 0x9d + .byte 0x84 + .byte 0x90 + .byte 0xd8 + .byte 0xab + .byte 0x0 + .byte 0x8c + .byte 0xbc + .byte 0xd3 + .byte 0xa + .byte 0xf7 + .byte 0xe4 + .byte 0x58 + .byte 0x5 + .byte 0xb8 + .byte 0xb3 + .byte 0x45 + .byte 0x6 + .byte 0xd0 + .byte 0x2c + .byte 0x1e + .byte 0x8f + .byte 0xca + .byte 0x3f + .byte 0xf + .byte 0x2 + .byte 0xc1 + .byte 0xaf + .byte 0xbd + .byte 0x3 + .byte 0x1 + .byte 0x13 + .byte 0x8a + .byte 0x6b + .byte 0x3a + .byte 0x91 + .byte 0x11 + .byte 0x41 + .byte 0x4f + .byte 0x67 + .byte 0xdc + .byte 0xea + .byte 0x97 + .byte 0xf2 + .byte 0xcf + .byte 0xce + .byte 0xf0 + .byte 0xb4 + .byte 0xe6 + .byte 0x73 + .byte 0x96 + .byte 0xac + .byte 0x74 + .byte 0x22 + .byte 0xe7 + .byte 0xad + .byte 0x35 + .byte 0x85 + .byte 0xe2 + .byte 0xf9 + .byte 0x37 + .byte 0xe8 + .byte 0x1c + .byte 0x75 + .byte 0xdf + .byte 0x6e + .byte 0x47 + .byte 0xf1 + .byte 0x1a + .byte 0x71 + .byte 0x1d + .byte 0x29 + .byte 0xc5 + .byte 0x89 + .byte 0x6f + .byte 0xb7 + .byte 0x62 + .byte 0xe + .byte 0xaa + .byte 0x18 + .byte 0xbe + .byte 0x1b + .byte 0xfc + .byte 0x56 + .byte 0x3e + .byte 0x4b + .byte 0xc6 + .byte 0xd2 + .byte 0x79 + .byte 0x20 + .byte 0x9a + .byte 0xdb + .byte 0xc0 + .byte 0xfe + .byte 0x78 + .byte 0xcd + .byte 0x5a + .byte 0xf4 + .byte 0x1f + .byte 0xdd + .byte 0xa8 + .byte 0x33 + .byte 0x88 + .byte 0x7 + .byte 0xc7 + .byte 0x31 + .byte 0xb1 + .byte 0x12 + .byte 0x10 + .byte 0x59 + .byte 0x27 + .byte 0x80 + .byte 0xec + .byte 0x5f + .byte 0x60 + .byte 0x51 + .byte 0x7f + .byte 0xa9 + .byte 0x19 + .byte 0xb5 + .byte 0x4a + .byte 0xd + .byte 0x2d + .byte 0xe5 + .byte 0x7a + .byte 0x9f + .byte 0x93 + .byte 0xc9 + .byte 0x9c + .byte 0xef + .byte 0xa0 + .byte 0xe0 + .byte 0x3b + .byte 0x4d + .byte 0xae + .byte 0x2a + .byte 0xf5 + .byte 0xb0 + .byte 0xc8 + .byte 0xeb + .byte 0xbb + .byte 0x3c + .byte 0x83 + .byte 0x53 + .byte 0x99 + .byte 0x61 + .byte 0x17 + .byte 0x2b + .byte 0x4 + .byte 0x7e + 
.byte 0xba + .byte 0x77 + .byte 0xd6 + .byte 0x26 + .byte 0xe1 + .byte 0x69 + .byte 0x14 + .byte 0x63 + .byte 0x55 + .byte 0x21 + .byte 0xc + .byte 0x7d +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) + .text + .align 4 + .globl AES_ECB_decrypt + .type AES_ECB_decrypt, %function +AES_ECB_decrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r8, [sp, #36] + MOV lr, r0 + LDR r0, L_AES_Thumb2_td_ecb + MOV r12, r2 + ADR r2, L_AES_Thumb2_td4 + CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_ECB_decrypt_start_block_128 +#else + BEQ.W L_AES_ECB_decrypt_start_block_128 +#endif + CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_ECB_decrypt_start_block_192 +#else + BEQ.W L_AES_ECB_decrypt_start_block_192 +#endif +L_AES_ECB_decrypt_loop_block_256: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r3, r12, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_decrypt_block + POP {r1, r3, r12, lr} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_decrypt_loop_block_256 +#else + BNE.W L_AES_ECB_decrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif +L_AES_ECB_decrypt_start_block_192: +L_AES_ECB_decrypt_loop_block_192: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r3, r12, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_decrypt_block + POP {r1, r3, r12, lr} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_decrypt_loop_block_192 +#else + BNE.W L_AES_ECB_decrypt_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif +L_AES_ECB_decrypt_start_block_128: +L_AES_ECB_decrypt_loop_block_128: + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + PUSH {r1, r3, r12, lr} + LDM r3!, {r8, r9, r10, r11} + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_decrypt_block + POP {r1, r3, r12, lr} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_ECB_decrypt_loop_block_128 +#else + BNE.W L_AES_ECB_decrypt_loop_block_128 +#endif +L_AES_ECB_decrypt_end: + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 210 */ + .size 
AES_ECB_decrypt,.-AES_ECB_decrypt +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC + .text + .align 4 + .globl AES_CBC_decrypt + .type AES_CBC_decrypt, %function +AES_CBC_decrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r8, [sp, #36] + LDR r4, [sp, #40] + MOV lr, r0 + LDR r0, L_AES_Thumb2_td_ecb + MOV r12, r2 + ADR r2, L_AES_Thumb2_td4 + PUSH {r3, r4} + CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_decrypt_loop_block_128 +#else + BEQ.W L_AES_CBC_decrypt_loop_block_128 +#endif + CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_decrypt_loop_block_192 +#else + BEQ.W L_AES_CBC_decrypt_loop_block_192 +#endif +L_AES_CBC_decrypt_loop_block_256: + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr, #16] + STRD r6, r7, [lr, #24] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDM lr, {r8, r9, r10, r11} + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.W L_AES_CBC_decrypt_end_odd +#endif + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr] + STRD r6, r7, [lr, #8] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDRD r8, r9, [lr, #16] + LDRD r10, r11, [lr, #24] + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_decrypt_loop_block_256 +#else + BNE.W L_AES_CBC_decrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif +L_AES_CBC_decrypt_loop_block_192: + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr, #16] + STRD r6, r7, [lr, #24] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDM lr, {r8, r9, r10, r11} + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.W L_AES_CBC_decrypt_end_odd +#endif + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr] + STRD r6, r7, [lr, #8] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDRD r8, r9, [lr, #16] + LDRD r10, r11, [lr, #24] + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_decrypt_loop_block_192 +#else + BNE.W L_AES_CBC_decrypt_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif +L_AES_CBC_decrypt_loop_block_128: + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr, #16] + STRD r6, r7, [lr, #24] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDM lr, {r8, r9, r10, r11} + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.W L_AES_CBC_decrypt_end_odd +#endif + PUSH {r1, r12, lr} + LDR r4, [lr] + LDR r5, [lr, #4] + LDR r6, [lr, #8] + LDR r7, [lr, #12] + LDR lr, [sp, #16] + STRD r4, r5, [lr] + STRD r6, r7, [lr, #8] + LDM r3!, {r8, r9, r10, r11} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_decrypt_block + LDR lr, [sp, #16] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDRD r8, r9, [lr, #16] + LDRD r10, r11, [lr, #24] + POP {r1, r12, lr} + LDR r3, [sp] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + SUBS r12, r12, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_CBC_decrypt_loop_block_128 +#else + BNE.W L_AES_CBC_decrypt_loop_block_128 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_CBC_decrypt_end +#else + B.N L_AES_CBC_decrypt_end +#endif +L_AES_CBC_decrypt_end_odd: + LDR r4, [sp, #4] + LDRD r8, r9, [r4, #16] + LDRD r10, r11, [r4, #24] + STRD r8, r9, [r4] + STRD r10, r11, [r4, #8] +L_AES_CBC_decrypt_end: + POP {r3, r4} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 518 */ + .size AES_CBC_decrypt,.-AES_CBC_decrypt +#endif /* HAVE_AES_CBC */ 
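The AES_CBC_decrypt loop above keeps the previous ciphertext block in a scratch area (alternating two 16-byte slots as it processes blocks in pairs) so the chaining XOR can be applied after each block decryption, and it writes the final ciphertext block back out as the next IV. A minimal C sketch of the chaining this implements, assuming a hypothetical single-block helper aes_decrypt_block standing in for the AES_decrypt_block routine above (the names here are illustrative and not part of this patch):

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for the single-block decrypt primitive:
 * decrypts one 16-byte block with an expanded key of nr rounds. */
void aes_decrypt_block(const uint32_t *ks, int nr, uint8_t out[16],
                       const uint8_t in[16]);

/* CBC decryption chaining: plaintext = D(ciphertext) XOR previous
 * ciphertext; the last ciphertext block becomes the next IV. */
static void cbc_decrypt(const uint32_t *ks, int nr, uint8_t *out,
                        const uint8_t *in, size_t len, uint8_t iv[16])
{
    uint8_t prev[16];
    uint8_t tmp[16];
    size_t  i, j;

    memcpy(prev, iv, 16);
    for (i = 0; i < len; i += 16) {
        memcpy(tmp, in + i, 16);            /* save ciphertext for chaining */
        aes_decrypt_block(ks, nr, out + i, tmp);
        for (j = 0; j < 16; j++)
            out[i + j] ^= prev[j];          /* XOR with previous ciphertext */
        memcpy(prev, tmp, 16);              /* this block chains the next one */
    }
    memcpy(iv, prev, 16);                   /* carry chaining value forward */
}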
+#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM + .text + .type L_GCM_gmult_len_r, %object + .size L_GCM_gmult_len_r, 64 + .align 4 +L_GCM_gmult_len_r: + .word 0x0 + .word 0x1c200000 + .word 0x38400000 + .word 0x24600000 + .word 0x70800000 + .word 0x6ca00000 + .word 0x48c00000 + .word 0x54e00000 + .word 0xe1000000 + .word 0xfd200000 + .word 0xd9400000 + .word 0xc5600000 + .word 0x91800000 + .word 0x8da00000 + .word 0xa9c00000 + .word 0xb5e00000 + .text + .align 4 + .globl GCM_gmult_len + .type GCM_gmult_len, %function +GCM_gmult_len: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ADR lr, L_GCM_gmult_len_r +L_GCM_gmult_len_start_block: + PUSH {r3} + LDR r12, [r0, #12] + LDR r3, [r2, #12] + EOR r12, r12, r3 + LSR r3, r12, #24 + AND r3, r3, #0xf + ADD r3, r1, r3, LSL #4 + LDM r3, {r8, r9, r10, r11} + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #28 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #16 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #20 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #8 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #12 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + AND r4, r12, #0xf + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #4 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + EOR 
r10, r6, r9, LSL #28 + LSR r9, r9, #4 + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + LDR r12, [r0, #8] + LDR r3, [r2, #8] + EOR r12, r12, r3 + LSR r3, r12, #24 + AND r3, r3, #0xf + ADD r3, r1, r3, LSL #4 + LDM r3, {r4, r5, r6, r7} + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #28 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #16 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #20 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #8 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #12 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + AND r4, r12, #0xf + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #4 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + LDR r12, [r0, #4] + LDR r3, [r2, #4] + EOR r12, r12, r3 + LSR r3, r12, #24 + AND r3, r3, #0xf + ADD r3, r1, r3, LSL #4 + LDM r3, {r4, r5, r6, r7} + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #28 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, 
r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #16 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #20 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #8 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #12 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + AND r4, r12, #0xf + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #4 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + LDR r12, [r0] + LDR r3, [r2] + EOR r12, r12, r3 + LSR r3, r12, #24 + AND r3, r3, #0xf + ADD r3, r1, r3, LSL #4 + LDM r3, {r4, r5, r6, r7} + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #28 + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #16 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #20 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + 
LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #8 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #12 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + AND r4, r12, #0xf + EOR r11, r11, r10, LSL #28 + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + LSR r6, r10, #4 + AND r3, r11, #0xf + LSR r11, r11, #4 + LSR r4, r12, #4 + EOR r11, r11, r10, LSL #28 + AND r4, r4, #0xf + LDR r3, [lr, r3, LSL #2] + ADD r4, r1, r4, LSL #4 + EOR r10, r6, r9, LSL #28 + LSR r9, r9, #4 + LDM r4, {r4, r5, r6, r7} + EOR r9, r9, r8, LSL #28 + EOR r8, r3, r8, LSR #4 + EOR r8, r8, r4 + EOR r9, r9, r5 + EOR r10, r10, r6 + EOR r11, r11, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STM r0, {r8, r9, r10, r11} + POP {r3} + SUBS r3, r3, #0x10 + ADD r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_GCM_gmult_len_start_block +#else + BNE.W L_GCM_gmult_len_start_block +#endif + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 742 */ + .size GCM_gmult_len,.-GCM_gmult_len + .text + .type L_AES_Thumb2_te_gcm, %object + .size L_AES_Thumb2_te_gcm, 12 + .align 4 +L_AES_Thumb2_te_gcm: + .word L_AES_Thumb2_te_data + .text + .align 4 + .globl AES_GCM_encrypt + .type AES_GCM_encrypt, %function +AES_GCM_encrypt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + LDR r12, [sp, #36] + LDR r8, [sp, #40] + MOV lr, r0 + LDR r0, L_AES_Thumb2_te_gcm + LDM r8, {r4, r5, r6, r7} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + PUSH {r3, r8} + CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_GCM_encrypt_start_block_128 +#else + BEQ.W L_AES_GCM_encrypt_start_block_128 +#endif + CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_AES_GCM_encrypt_start_block_192 +#else + BEQ.W L_AES_GCM_encrypt_start_block_192 +#endif +L_AES_GCM_encrypt_loop_block_256: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADD r7, r7, #0x1 + LDM r3!, {r8, r9, r10, r11} + STR r7, [lr, #12] + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x6 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, 
#4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCM_encrypt_loop_block_256 +#else + BNE.W L_AES_GCM_encrypt_loop_block_256 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif +L_AES_GCM_encrypt_start_block_192: +L_AES_GCM_encrypt_loop_block_192: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADD r7, r7, #0x1 + LDM r3!, {r8, r9, r10, r11} + STR r7, [lr, #12] + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x5 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCM_encrypt_loop_block_192 +#else + BNE.W L_AES_GCM_encrypt_loop_block_192 +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif +L_AES_GCM_encrypt_start_block_128: +L_AES_GCM_encrypt_loop_block_128: + PUSH {r1, r2, lr} + LDR lr, [sp, #16] + ADD r7, r7, #0x1 + LDM r3!, {r8, r9, r10, r11} + STR r7, [lr, #12] + /* Round: 0 - XOR in key schedule */ + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + MOV r1, #0x4 + BL AES_encrypt_block + POP {r1, r2, lr} + LDR r3, [sp] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + LDR r8, [lr] + LDR r9, [lr, #4] + LDR r10, [lr, #8] + LDR r11, [lr, #12] + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDR r8, [sp, #4] + STR r4, [r1] + STR r5, [r1, #4] + STR r6, [r1, #8] + STR r7, [r1, #12] + LDM r8, {r4, r5, r6, r7} + SUBS r2, r2, #0x10 + ADD lr, lr, #0x10 + ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_AES_GCM_encrypt_loop_block_128 +#else + BNE.W L_AES_GCM_encrypt_loop_block_128 +#endif +L_AES_GCM_encrypt_end: + POP {r3, r8} + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + STM r8, {r4, r5, r6, r7} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 275 */ + .size AES_GCM_encrypt,.-AES_GCM_encrypt +#endif /* HAVE_AESGCM */ +#endif /* !NO_AES */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c new file mode 100644 index 000000000..da16fdc10 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -0,0 +1,3352 @@ +/* thumb2-aes-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./aes/aes.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-aes-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifndef NO_AES +#include + +#ifdef HAVE_AES_DECRYPT +XALIGNED(16) static const word32 L_AES_Thumb2_td_data[] = { + 0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e, + 0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303, + 0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c, + 0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3, + 0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0, + 0x02c32f75, 0x12814cf0, 0xa38d4697, 0xc66bd3f9, + 0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259, + 0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8, + 0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971, + 0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a, + 0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f, + 0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b, + 0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8, + 0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab, + 0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708, + 0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682, + 0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2, + 0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe, + 0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb, + 0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10, + 0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd, + 0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015, + 0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e, + 0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee, + 0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000, + 0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72, + 0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39, + 0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e, + 0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91, + 0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a, + 0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17, + 0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9, + 0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60, + 0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e, + 0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1, + 0xcad731dc, 0x10426385, 0x40139722, 0x2084c611, + 0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1, + 0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3, + 0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964, + 0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390, + 0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b, + 0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf, + 0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46, + 0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af, + 0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512, + 0x3bc8ac99, 0xa710187d, 0x6ee89c63, 0x7bdb3bbb, + 0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a, + 0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8, + 0xd9bae79b, 
0xce4a6f36, 0xd4ea9f09, 0xd629b07c, + 0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266, + 0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8, + 0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6, + 0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604, + 0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551, + 0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41, + 0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647, + 0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c, + 0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1, + 0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737, + 0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db, + 0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340, + 0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95, + 0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1, + 0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857, +}; + +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +XALIGNED(16) static const word32 L_AES_Thumb2_te_data[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 
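/* Editor's note -- illustrative sketch only, not part of the generated patch.
 * Each word of L_AES_Thumb2_te_data packs the S-box output of one byte
 * multiplied by the MixColumns coefficients {3, 2, 1, 1} in GF(2^8); the
 * ROR #8/#16/#24 rotations in the round code later in this file pick out the
 * byte positions.  Assuming a standard AES S-box table sbox[256] (not shown
 * here), the entries can be regenerated like this -- te_entry(0x00) gives
 * 0xa5c66363, the first word of the table above:
 *
 *   static unsigned char xtime(unsigned char b)        // multiply by 2 in GF(2^8)
 *   {
 *       return (unsigned char)((b << 1) ^ ((b >> 7) * 0x1b));
 *   }
 *
 *   static word32 te_entry(unsigned char x)
 *   {
 *       unsigned char s  = sbox[x];                    // SubBytes
 *       unsigned char s2 = xtime(s);                   // 2*s
 *       unsigned char s3 = (unsigned char)(s2 ^ s);    // 3*s
 *       return ((word32)s3 << 24) | ((word32)s2 << 16) |
 *              ((word32)s  <<  8) |  (word32)s;
 *   }
 */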
0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +static const word32* L_AES_Thumb2_td = L_AES_Thumb2_td_data; +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +static const word32* L_AES_Thumb2_te = L_AES_Thumb2_te_data; +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +void AES_invert_key(unsigned char* ks, word32 rounds); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_invert_key(unsigned char* ks_p, word32 rounds_p) +#else +void AES_invert_key(unsigned char* ks, word32 rounds) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register unsigned char* ks __asm__ ("r0") = (unsigned char*)ks_p; + register word32 rounds __asm__ ("r1") = (word32)rounds_p; + register word32* L_AES_Thumb2_te_c __asm__ ("r2") = (word32*)L_AES_Thumb2_te; + register word32* L_AES_Thumb2_td_c __asm__ ("r3") = (word32*)L_AES_Thumb2_td; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r12, %[L_AES_Thumb2_te]\n\t" + "MOV lr, %[L_AES_Thumb2_td]\n\t" + "ADD r10, %[ks], %[rounds], LSL #4\n\t" + "MOV r11, %[rounds]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_invert_key_loop:\n\t" +#else + "L_AES_invert_key_loop_%=:\n\t" +#endif + "LDM %[ks], {r2, r3, r4, r5}\n\t" + "LDM r10, {r6, r7, r8, r9}\n\t" + "STM r10, {r2, r3, r4, r5}\n\t" + "STM %[ks]!, {r6, r7, r8, r9}\n\t" + "SUBS r11, r11, #0x2\n\t" + "SUB r10, r10, #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_invert_key_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_AES_invert_key_loop\n\t" +#else + "BNE.N L_AES_invert_key_loop_%=\n\t" +#endif + "SUB %[ks], %[ks], %[rounds], LSL #3\n\t" + "ADD %[ks], %[ks], #0x10\n\t" + "SUB r11, %[rounds], #0x1\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_invert_key_mix_loop:\n\t" +#else + "L_AES_invert_key_mix_loop_%=:\n\t" +#endif + "LDM %[ks], {r2, r3, r4, r5}\n\t" + "UBFX r6, r2, #0, #8\n\t" + "UBFX r7, r2, #8, #8\n\t" + "UBFX r8, r2, #16, #8\n\t" + "LSR r9, r2, #24\n\t" + "LDRB r6, [r12, r6, LSL #2]\n\t" + "LDRB r7, [r12, r7, LSL #2]\n\t" + "LDRB r8, [r12, r8, LSL #2]\n\t" + "LDRB r9, [r12, r9, LSL #2]\n\t" + "LDR r6, [lr, r6, LSL #2]\n\t" + "LDR r7, [lr, r7, LSL #2]\n\t" + "LDR r8, [lr, r8, LSL #2]\n\t" + "LDR r9, [lr, r9, LSL #2]\n\t" + "EOR r8, r8, r6, ROR #16\n\t" + "EOR r8, r8, r7, ROR #8\n\t" + "EOR r8, r8, r9, ROR #24\n\t" + "STR r8, [%[ks]], #4\n\t" + "UBFX r6, r3, #0, #8\n\t" + "UBFX r7, r3, #8, #8\n\t" + "UBFX r8, r3, #16, #8\n\t" + "LSR r9, 
r3, #24\n\t" + "LDRB r6, [r12, r6, LSL #2]\n\t" + "LDRB r7, [r12, r7, LSL #2]\n\t" + "LDRB r8, [r12, r8, LSL #2]\n\t" + "LDRB r9, [r12, r9, LSL #2]\n\t" + "LDR r6, [lr, r6, LSL #2]\n\t" + "LDR r7, [lr, r7, LSL #2]\n\t" + "LDR r8, [lr, r8, LSL #2]\n\t" + "LDR r9, [lr, r9, LSL #2]\n\t" + "EOR r8, r8, r6, ROR #16\n\t" + "EOR r8, r8, r7, ROR #8\n\t" + "EOR r8, r8, r9, ROR #24\n\t" + "STR r8, [%[ks]], #4\n\t" + "UBFX r6, r4, #0, #8\n\t" + "UBFX r7, r4, #8, #8\n\t" + "UBFX r8, r4, #16, #8\n\t" + "LSR r9, r4, #24\n\t" + "LDRB r6, [r12, r6, LSL #2]\n\t" + "LDRB r7, [r12, r7, LSL #2]\n\t" + "LDRB r8, [r12, r8, LSL #2]\n\t" + "LDRB r9, [r12, r9, LSL #2]\n\t" + "LDR r6, [lr, r6, LSL #2]\n\t" + "LDR r7, [lr, r7, LSL #2]\n\t" + "LDR r8, [lr, r8, LSL #2]\n\t" + "LDR r9, [lr, r9, LSL #2]\n\t" + "EOR r8, r8, r6, ROR #16\n\t" + "EOR r8, r8, r7, ROR #8\n\t" + "EOR r8, r8, r9, ROR #24\n\t" + "STR r8, [%[ks]], #4\n\t" + "UBFX r6, r5, #0, #8\n\t" + "UBFX r7, r5, #8, #8\n\t" + "UBFX r8, r5, #16, #8\n\t" + "LSR r9, r5, #24\n\t" + "LDRB r6, [r12, r6, LSL #2]\n\t" + "LDRB r7, [r12, r7, LSL #2]\n\t" + "LDRB r8, [r12, r8, LSL #2]\n\t" + "LDRB r9, [r12, r9, LSL #2]\n\t" + "LDR r6, [lr, r6, LSL #2]\n\t" + "LDR r7, [lr, r7, LSL #2]\n\t" + "LDR r8, [lr, r8, LSL #2]\n\t" + "LDR r9, [lr, r9, LSL #2]\n\t" + "EOR r8, r8, r6, ROR #16\n\t" + "EOR r8, r8, r7, ROR #8\n\t" + "EOR r8, r8, r9, ROR #24\n\t" + "STR r8, [%[ks]], #4\n\t" + "SUBS r11, r11, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_invert_key_mix_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_invert_key_mix_loop\n\t" +#else + "BNE.W L_AES_invert_key_mix_loop_%=\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [ks] "+r" (ks), [rounds] "+r" (rounds), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td) + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#endif /* HAVE_AES_DECRYPT */ +XALIGNED(16) static const word32 L_AES_Thumb2_rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1b000000, 0x36000000 +}; + +void AES_set_encrypt_key(const unsigned char* key, word32 len, + unsigned char* ks); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char* ks_p) +#else +void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* key __asm__ ("r0") = (const unsigned char*)key_p; + register word32 len __asm__ ("r1") = (word32)len_p; + register unsigned char* ks __asm__ ("r2") = (unsigned char*)ks_p; + register word32* L_AES_Thumb2_te_c __asm__ ("r3") = (word32*)L_AES_Thumb2_te; + register word32* L_AES_Thumb2_rcon_c __asm__ ("r4") = (word32*)&L_AES_Thumb2_rcon; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r10, %[L_AES_Thumb2_te]\n\t" + "MOV lr, %[L_AES_Thumb2_rcon]\n\t" + "CMP %[len], #0x80\n\t" +#if defined(__GNUC__) + "BEQ L_AES_set_encrypt_key_start_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_set_encrypt_key_start_128\n\t" +#else + "BEQ.W L_AES_set_encrypt_key_start_128_%=\n\t" 
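/* Editor's note -- hedged usage sketch, not generated code.  The CMPs against
 * 0x80 and 0xc0 just above indicate that AES_set_encrypt_key() takes the key
 * length in bits (128 or 192; anything else is handled as 256).  A caller that
 * also needs a decryption schedule would expand the key and then invert it
 * with AES_invert_key(), whose second argument appears to be the AES round
 * count (10/12/14 per FIPS-197), roughly like this:
 *
 *   static void setup_schedules(const unsigned char* key, word32 key_bits,
 *                               unsigned char* enc_ks, unsigned char* dec_ks)
 *   {
 *       AES_set_encrypt_key(key, key_bits, enc_ks);
 *   #ifdef HAVE_AES_DECRYPT
 *       {
 *           word32 rounds = (key_bits == 128) ? 10 :
 *                           (key_bits == 192) ? 12 : 14;
 *           AES_set_encrypt_key(key, key_bits, dec_ks);
 *           AES_invert_key(dec_ks, rounds);   // invert the expanded key in place
 *       }
 *   #else
 *       (void)dec_ks;
 *   #endif
 *   }
 *
 * Each schedule buffer needs (rounds + 1) * 16 bytes of space.
 */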
+#endif + "CMP %[len], #0xc0\n\t" +#if defined(__GNUC__) + "BEQ L_AES_set_encrypt_key_start_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_set_encrypt_key_start_192\n\t" +#else + "BEQ.W L_AES_set_encrypt_key_start_192_%=\n\t" +#endif + "LDR r4, [%[key]]\n\t" + "LDR r5, [%[key], #4]\n\t" + "LDR r6, [%[key], #8]\n\t" + "LDR r7, [%[key], #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM %[ks]!, {r4, r5, r6, r7}\n\t" + "LDR r4, [%[key], #16]\n\t" + "LDR r5, [%[key], #20]\n\t" + "LDR r6, [%[key], #24]\n\t" + "LDR r7, [%[key], #28]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "SUB %[ks], %[ks], #0x10\n\t" + "MOV r12, #0x6\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_set_encrypt_key_loop_256:\n\t" +#else + "L_AES_set_encrypt_key_loop_256_%=:\n\t" +#endif + "UBFX r4, r7, #0, #8\n\t" + "UBFX r5, r7, #8, #8\n\t" + "UBFX r6, r7, #16, #8\n\t" + "LSR r7, r7, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" + "EOR r3, r7, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r4, r4, r3\n\t" + "LDM lr!, {r3}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "ADD %[ks], %[ks], #0x10\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "SUB %[ks], %[ks], #0x10\n\t" + "MOV r3, r7\n\t" + "UBFX r4, r3, #8, #8\n\t" + "UBFX r5, r3, #16, #8\n\t" + "LSR r6, r3, #24\n\t" + "UBFX r3, r3, #0, #8\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r3, [r10, r3, LSL #2]\n\t" + "EOR r3, r3, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "ADD %[ks], %[ks], #0x10\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "SUB %[ks], %[ks], #0x10\n\t" + "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_set_encrypt_key_loop_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_AES_set_encrypt_key_loop_256\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_256_%=\n\t" +#endif + "UBFX r4, r7, #0, #8\n\t" + "UBFX r5, r7, #8, #8\n\t" + "UBFX r6, r7, #16, #8\n\t" + "LSR r7, r7, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" + "EOR r3, r7, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r4, r4, r3\n\t" + "LDM lr!, {r3}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "ADD %[ks], %[ks], #0x10\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "SUB %[ks], %[ks], #0x10\n\t" +#if defined(__GNUC__) + "B L_AES_set_encrypt_key_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_set_encrypt_key_start_192:\n\t" +#else + "L_AES_set_encrypt_key_start_192_%=:\n\t" +#endif + "LDR r4, [%[key]]\n\t" + "LDR r5, [%[key], #4]\n\t" + "LDR r6, [%[key], #8]\n\t" + "LDR r7, [%[key], #12]\n\t" + "LDR r8, [%[key], 
#16]\n\t" + "LDR r9, [%[key], #20]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "STRD r8, r9, [%[ks], #16]\n\t" + "MOV r7, r9\n\t" + "MOV r12, #0x7\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_set_encrypt_key_loop_192:\n\t" +#else + "L_AES_set_encrypt_key_loop_192_%=:\n\t" +#endif + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" + "LDM lr!, {r3}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "EOR r8, r8, r7\n\t" + "EOR r9, r9, r8\n\t" + "STM %[ks], {r4, r5, r6, r7, r8, r9}\n\t" + "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_set_encrypt_key_loop_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_AES_set_encrypt_key_loop_192\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_192_%=\n\t" +#endif + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" + "LDM lr!, {r3}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" +#if defined(__GNUC__) + "B L_AES_set_encrypt_key_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_set_encrypt_key_start_128:\n\t" +#else + "L_AES_set_encrypt_key_start_128_%=:\n\t" +#endif + "LDR r4, [%[key]]\n\t" + "LDR r5, [%[key], #4]\n\t" + "LDR r6, [%[key], #8]\n\t" + "LDR r7, [%[key], #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "MOV r12, #0xa\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_set_encrypt_key_loop_128:\n\t" +#else + "L_AES_set_encrypt_key_loop_128_%=:\n\t" +#endif + "UBFX r4, r7, #0, #8\n\t" + "UBFX r5, r7, #8, #8\n\t" + "UBFX r6, r7, #16, #8\n\t" + "LSR r7, r7, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" + "EOR r3, r7, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r4, r4, r3\n\t" + "LDM lr!, {r3}\n\t" + "EOR r4, r4, r3\n\t" + "EOR r5, r5, r4\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" + "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_set_encrypt_key_loop_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_AES_set_encrypt_key_loop_128\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + 
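/* Editor's note -- illustrative sketch, not generated code.  The ten words of
 * L_AES_Thumb2_rcon, consumed one per iteration by the "LDM lr!, {r3}" loads
 * above, are the AES round constants: successive doublings of 0x01 in GF(2^8)
 * kept in the top byte.  They can be regenerated with the same xtime() idea
 * sketched beside the Te table earlier in this file:
 *
 *   static void make_rcon(word32 rcon[10])
 *   {
 *       unsigned char r = 0x01;
 *       int i;
 *       for (i = 0; i < 10; i++) {
 *           rcon[i] = (word32)r << 24;
 *           r = (unsigned char)((r << 1) ^ ((r >> 7) * 0x1b));   // xtime(r)
 *       }
 *   }
 */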
"L_AES_set_encrypt_key_end:\n\t" +#else + "L_AES_set_encrypt_key_end_%=:\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) + : + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "cc" +#else + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon) + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +void AES_encrypt_block(const word32* te, int nr, int len, const word32* ks); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_encrypt_block(const word32* te_p, int nr_p, int len_p, const word32* ks_p) +#else +void AES_encrypt_block(const word32* te, int nr, int len, const word32* ks) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const word32* te __asm__ ("r0") = (const word32*)te_p; + register int nr __asm__ ("r1") = (int)nr_p; + register int len __asm__ ("r2") = (int)len_p; + register const word32* ks __asm__ ("r3") = (const word32*)ks_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_encrypt_block_nr:\n\t" +#else + "L_AES_encrypt_block_nr_%=:\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [%[te], r8, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [%[te], r9, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [%[te], r10, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [%[te], r6, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r9, #16, #8\n\t" + "LSR r7, r8, #24\n\t" + "UBFX lr, r10, #8, #8\n\t" + "UBFX r2, r11, #0, #8\n\t" + "LDR r4, [%[te], r4, LSL #2]\n\t" + "LDR r7, [%[te], r7, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r5, r10, #16, #8\n\t" + "EOR r4, r4, r7, ROR #24\n\t" + "LSR r7, r9, #24\n\t" + "EOR r4, r4, lr, ROR #8\n\t" + "UBFX lr, r11, #8, #8\n\t" + "EOR r4, r4, r2, ROR #16\n\t" + "UBFX r2, r8, #0, #8\n\t" + "LDR r5, [%[te], r5, LSL #2]\n\t" + "LDR r7, [%[te], r7, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL 
#2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r6, r11, #16, #8\n\t" + "EOR r5, r5, r7, ROR #24\n\t" + "LSR r7, r10, #24\n\t" + "EOR r5, r5, lr, ROR #8\n\t" + "UBFX lr, r8, #8, #8\n\t" + "EOR r5, r5, r2, ROR #16\n\t" + "UBFX r2, r9, #0, #8\n\t" + "LDR r6, [%[te], r6, LSL #2]\n\t" + "LDR r7, [%[te], r7, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r10, r10, #0, #8\n\t" + "EOR r6, r6, r7, ROR #24\n\t" + "UBFX r7, r8, #16, #8\n\t" + "EOR r6, r6, lr, ROR #8\n\t" + "LSR lr, r11, #24\n\t" + "EOR r6, r6, r2, ROR #16\n\t" + "UBFX r2, r9, #8, #8\n\t" + "LDR r10, [%[te], r10, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r7, [%[te], r7, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "EOR lr, lr, r10, ROR #24\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, ROR #24\n\t" + "EOR r7, r7, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS %[nr], %[nr], #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_encrypt_block_nr_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_encrypt_block_nr\n\t" +#else + "BNE.W L_AES_encrypt_block_nr_%=\n\t" +#endif + "UBFX r8, r5, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX lr, r6, #8, #8\n\t" + "UBFX r2, r7, #0, #8\n\t" + "LDR r8, [%[te], r8, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r9, r6, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, lr, ROR #8\n\t" + "UBFX lr, r7, #8, #8\n\t" + "EOR r8, r8, r2, ROR #16\n\t" + "UBFX r2, r4, #0, #8\n\t" + "LDR r9, [%[te], r9, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r10, r7, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, lr, ROR #8\n\t" + "UBFX lr, r4, #8, #8\n\t" + "EOR r9, r9, r2, ROR #16\n\t" + "UBFX r2, r5, #0, #8\n\t" + "LDR r10, [%[te], r10, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "UBFX r6, r6, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r4, #16, #8\n\t" + "EOR r10, r10, lr, ROR #8\n\t" + "LSR lr, r7, #24\n\t" + "EOR r10, r10, r2, ROR #16\n\t" + "UBFX r2, r5, #8, #8\n\t" + "LDR r6, [%[te], r6, LSL #2]\n\t" + "LDR lr, [%[te], lr, LSL #2]\n\t" + "LDR r11, [%[te], r11, LSL #2]\n\t" + "LDR r2, [%[te], r2, LSL #2]\n\t" + "EOR lr, lr, r6, ROR #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #24\n\t" + "EOR r11, r11, r2, ROR #8\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r11, #0, #8\n\t" + "UBFX r7, r10, #8, #8\n\t" + "UBFX lr, r9, #16, #8\n\t" + "LSR r2, r8, #24\n\t" + "LDRB r4, [%[te], r4, LSL #2]\n\t" + "LDRB r7, [%[te], r7, LSL #2]\n\t" + "LDRB lr, [%[te], lr, LSL #2]\n\t" + "LDRB r2, [%[te], r2, LSL #2]\n\t" + "UBFX r5, r8, #0, #8\n\t" + "EOR r4, r4, r7, LSL #8\n\t" + "UBFX r7, r11, #8, #8\n\t" + "EOR r4, r4, lr, LSL #16\n\t" + "UBFX lr, r10, #16, #8\n\t" + "EOR r4, r4, r2, LSL #24\n\t" + "LSR r2, r9, #24\n\t" + "LDRB r5, [%[te], r5, LSL #2]\n\t" + "LDRB r7, [%[te], r7, LSL #2]\n\t" + "LDRB lr, [%[te], lr, LSL #2]\n\t" + "LDRB r2, [%[te], r2, LSL #2]\n\t" + "UBFX r6, r9, #0, #8\n\t" + "EOR r5, r5, r7, LSL #8\n\t" + "UBFX r7, r8, #8, #8\n\t" + "EOR r5, r5, lr, LSL 
#16\n\t" + "UBFX lr, r11, #16, #8\n\t" + "EOR r5, r5, r2, LSL #24\n\t" + "LSR r2, r10, #24\n\t" + "LDRB r6, [%[te], r6, LSL #2]\n\t" + "LDRB r7, [%[te], r7, LSL #2]\n\t" + "LDRB lr, [%[te], lr, LSL #2]\n\t" + "LDRB r2, [%[te], r2, LSL #2]\n\t" + "LSR r11, r11, #24\n\t" + "EOR r6, r6, r7, LSL #8\n\t" + "UBFX r7, r10, #0, #8\n\t" + "EOR r6, r6, lr, LSL #16\n\t" + "UBFX lr, r9, #8, #8\n\t" + "EOR r6, r6, r2, LSL #24\n\t" + "UBFX r2, r8, #16, #8\n\t" + "LDRB r11, [%[te], r11, LSL #2]\n\t" + "LDRB r7, [%[te], r7, LSL #2]\n\t" + "LDRB lr, [%[te], lr, LSL #2]\n\t" + "LDRB r2, [%[te], r2, LSL #2]\n\t" + "EOR lr, lr, r11, LSL #16\n\t" + "LDM %[ks], {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, LSL #8\n\t" + "EOR r7, r7, r2, LSL #16\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + : [te] "+r" (te), [nr] "+r" (nr), [len] "+r" (len), [ks] "+r" (ks) + : + : "memory", "lr", "cc" + ); +} + +#if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +static const word32* L_AES_Thumb2_te_ecb = L_AES_Thumb2_te_data; +#endif /* HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +#else +void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register word32* L_AES_Thumb2_te_ecb_c __asm__ ("r5") = (word32*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "PUSH {%[ks]}\n\t" + "CMP r12, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_ECB_encrypt_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_ECB_encrypt_start_block_128\n\t" +#else + "BEQ.W L_AES_ECB_encrypt_start_block_128_%=\n\t" +#endif + "CMP r12, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_ECB_encrypt_start_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_ECB_encrypt_start_block_192\n\t" +#else + "BEQ.W L_AES_ECB_encrypt_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_loop_block_256:\n\t" +#else + "L_AES_ECB_encrypt_loop_block_256_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in 
key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_encrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_ECB_encrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_ECB_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_start_block_192:\n\t" +#else + "L_AES_ECB_encrypt_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_loop_block_192:\n\t" +#else + "L_AES_ECB_encrypt_loop_block_192_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_encrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_ECB_encrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_ECB_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_start_block_128:\n\t" +#else + "L_AES_ECB_encrypt_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_loop_block_128:\n\t" +#else + "L_AES_ECB_encrypt_loop_block_128_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD 
%[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_encrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_encrypt_end:\n\t" +#else + "L_AES_ECB_encrypt_end_%=:\n\t" +#endif + "POP {%[ks]}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) + : + : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) + : "memory", "r12", "lr", "r4", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC +void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +#else +void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; + register word32* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = (word32*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r9, r5\n\t" +#else + "LDR r9, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" + "LDM r9, {r4, r5, r6, r7}\n\t" + "PUSH {%[ks], r9}\n\t" + "CMP r8, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_encrypt_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_encrypt_start_block_128\n\t" +#else + "BEQ.W L_AES_CBC_encrypt_start_block_128_%=\n\t" +#endif + "CMP r8, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_encrypt_start_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_encrypt_start_block_192\n\t" +#else + "BEQ.W L_AES_CBC_encrypt_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_loop_block_256:\n\t" +#else + "L_AES_CBC_encrypt_loop_block_256_%=:\n\t" +#endif + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM 
%[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_encrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_CBC_encrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CBC_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_start_block_192:\n\t" +#else + "L_AES_CBC_encrypt_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_loop_block_192:\n\t" +#else + "L_AES_CBC_encrypt_loop_block_192_%=:\n\t" +#endif + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_encrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CBC_encrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CBC_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_start_block_128:\n\t" +#else + "L_AES_CBC_encrypt_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_loop_block_128:\n\t" +#else + "L_AES_CBC_encrypt_loop_block_128_%=:\n\t" +#endif + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "PUSH {r1, %[len], lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL 
AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_encrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_encrypt_end:\n\t" +#else + "L_AES_CBC_encrypt_end_%=:\n\t" +#endif + "POP {%[ks], r9}\n\t" + "STM r9, {r4, r5, r6, r7}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) + : + : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) + : "memory", "r12", "lr", "r4", "r5", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)iv; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +#else +void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; + register word32* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = (word32*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" + "PUSH {%[ks], r8}\n\t" + "CMP r12, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CTR_encrypt_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CTR_encrypt_start_block_128\n\t" +#else + "BEQ.W L_AES_CTR_encrypt_start_block_128_%=\n\t" +#endif + "CMP r12, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CTR_encrypt_start_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) 
&& (__VER__ < 9000000) + "BEQ.W L_AES_CTR_encrypt_start_block_192\n\t" +#else + "BEQ.W L_AES_CTR_encrypt_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_loop_block_256:\n\t" +#else + "L_AES_CTR_encrypt_loop_block_256_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADDS r11, r7, #0x1\n\t" + "ADCS r10, r6, #0x0\n\t" + "ADCS r9, r5, #0x0\n\t" + "ADC r8, r4, #0x0\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CTR_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CTR_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_start_block_192:\n\t" +#else + "L_AES_CTR_encrypt_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_loop_block_192:\n\t" +#else + "L_AES_CTR_encrypt_loop_block_192_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADDS r11, r7, #0x1\n\t" + "ADCS r10, r6, #0x0\n\t" + "ADCS r9, r5, #0x0\n\t" + "ADC r8, r4, #0x0\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CTR_encrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CTR_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CTR_encrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CTR_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end_%=\n\t" +#endif + "\n" +#if 
defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_start_block_128:\n\t" +#else + "L_AES_CTR_encrypt_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_loop_block_128:\n\t" +#else + "L_AES_CTR_encrypt_loop_block_128_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADDS r11, r7, #0x1\n\t" + "ADCS r10, r6, #0x0\n\t" + "ADCS r9, r5, #0x0\n\t" + "ADC r8, r4, #0x0\n\t" + "STM lr, {r8, r9, r10, r11}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CTR_encrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CTR_encrypt_end:\n\t" +#else + "L_AES_CTR_encrypt_end_%=:\n\t" +#endif + "POP {%[ks], r8}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) + : + : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) + : "memory", "r12", "lr", "r4", "r5", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)ctr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) +void AES_decrypt_block(const word32* td, int nr, const byte* td4); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_decrypt_block(const word32* td_p, int nr_p, const byte* td4_p) +#else +void AES_decrypt_block(const word32* td, int nr, const byte* td4) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const word32* td __asm__ ("r0") = (const word32*)td_p; + register int nr __asm__ ("r1") = (int)nr_p; + register const byte* td4 __asm__ ("r2") = (const byte*)td4_p; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_decrypt_block_nr:\n\t" +#else + "L_AES_decrypt_block_nr_%=:\n\t" +#endif + "UBFX r8, r7, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX r12, r6, #8, #8\n\t" + "UBFX lr, r5, #0, #8\n\t" + "LDR r8, [%[td], r8, LSL #2]\n\t" 
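/* Editor's note -- a hedged observation, not generated code.  The nr argument
 * of AES_encrypt_block()/AES_decrypt_block() is not the AES round count: the
 * block-mode drivers above load it with 4, 5 or 6 (MOV r1, #0x4/#0x5/#0x6)
 * for 128/192/256-bit keys.  Each pass of the inner loop performs two rounds
 * and the last two rounds are unrolled after it, so the value appears to be
 * (Nr - 2) / 2.  A caller driving these helpers directly could derive it as:
 *
 *   static int aes_double_round_count(int key_bits)
 *   {
 *       int Nr = (key_bits == 128) ? 10 : (key_bits == 192) ? 12 : 14;
 *       return (Nr - 2) / 2;   // 4, 5 or 6 two-round loop iterations
 *   }
 */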
+ "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r9, r4, #16, #8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, r12, ROR #8\n\t" + "UBFX r12, r7, #8, #8\n\t" + "EOR r8, r8, lr, ROR #16\n\t" + "UBFX lr, r6, #0, #8\n\t" + "LDR r9, [%[td], r9, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r10, r5, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, r12, ROR #8\n\t" + "UBFX r12, r4, #8, #8\n\t" + "EOR r9, r9, lr, ROR #16\n\t" + "UBFX lr, r7, #0, #8\n\t" + "LDR r10, [%[td], r10, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r4, r4, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r6, #16, #8\n\t" + "EOR r10, r10, r12, ROR #8\n\t" + "LSR r12, r7, #24\n\t" + "EOR r10, r10, lr, ROR #16\n\t" + "UBFX lr, r5, #8, #8\n\t" + "LDR r4, [%[td], r4, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "EOR r12, r12, r4, ROR #24\n\t" + "LDM r3!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #8\n\t" + "EOR r11, r11, r12, ROR #24\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r11, #16, #8\n\t" + "LSR r7, r8, #24\n\t" + "UBFX r12, r10, #8, #8\n\t" + "UBFX lr, r9, #0, #8\n\t" + "LDR r4, [%[td], r4, LSL #2]\n\t" + "LDR r7, [%[td], r7, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r5, r8, #16, #8\n\t" + "EOR r4, r4, r7, ROR #24\n\t" + "LSR r7, r9, #24\n\t" + "EOR r4, r4, r12, ROR #8\n\t" + "UBFX r12, r11, #8, #8\n\t" + "EOR r4, r4, lr, ROR #16\n\t" + "UBFX lr, r10, #0, #8\n\t" + "LDR r5, [%[td], r5, LSL #2]\n\t" + "LDR r7, [%[td], r7, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r6, r9, #16, #8\n\t" + "EOR r5, r5, r7, ROR #24\n\t" + "LSR r7, r10, #24\n\t" + "EOR r5, r5, r12, ROR #8\n\t" + "UBFX r12, r8, #8, #8\n\t" + "EOR r5, r5, lr, ROR #16\n\t" + "UBFX lr, r11, #0, #8\n\t" + "LDR r6, [%[td], r6, LSL #2]\n\t" + "LDR r7, [%[td], r7, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r8, r8, #0, #8\n\t" + "EOR r6, r6, r7, ROR #24\n\t" + "UBFX r7, r10, #16, #8\n\t" + "EOR r6, r6, r12, ROR #8\n\t" + "LSR r12, r11, #24\n\t" + "EOR r6, r6, lr, ROR #16\n\t" + "UBFX lr, r9, #8, #8\n\t" + "LDR r8, [%[td], r8, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR r7, [%[td], r7, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "EOR r12, r12, r8, ROR #24\n\t" + "LDM r3!, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, lr, ROR #8\n\t" + "EOR r7, r7, r12, ROR #24\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "SUBS %[nr], %[nr], #0x1\n\t" +#if defined(__GNUC__) + "BNE L_AES_decrypt_block_nr_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_decrypt_block_nr\n\t" +#else + "BNE.W L_AES_decrypt_block_nr_%=\n\t" +#endif + "UBFX r8, r7, #16, #8\n\t" + "LSR r11, r4, #24\n\t" + "UBFX r12, r6, #8, #8\n\t" + "UBFX lr, r5, #0, #8\n\t" + "LDR r8, [%[td], r8, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r9, r4, #16, 
#8\n\t" + "EOR r8, r8, r11, ROR #24\n\t" + "LSR r11, r5, #24\n\t" + "EOR r8, r8, r12, ROR #8\n\t" + "UBFX r12, r7, #8, #8\n\t" + "EOR r8, r8, lr, ROR #16\n\t" + "UBFX lr, r6, #0, #8\n\t" + "LDR r9, [%[td], r9, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r10, r5, #16, #8\n\t" + "EOR r9, r9, r11, ROR #24\n\t" + "LSR r11, r6, #24\n\t" + "EOR r9, r9, r12, ROR #8\n\t" + "UBFX r12, r4, #8, #8\n\t" + "EOR r9, r9, lr, ROR #16\n\t" + "UBFX lr, r7, #0, #8\n\t" + "LDR r10, [%[td], r10, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "UBFX r4, r4, #0, #8\n\t" + "EOR r10, r10, r11, ROR #24\n\t" + "UBFX r11, r6, #16, #8\n\t" + "EOR r10, r10, r12, ROR #8\n\t" + "LSR r12, r7, #24\n\t" + "EOR r10, r10, lr, ROR #16\n\t" + "UBFX lr, r5, #8, #8\n\t" + "LDR r4, [%[td], r4, LSL #2]\n\t" + "LDR r12, [%[td], r12, LSL #2]\n\t" + "LDR r11, [%[td], r11, LSL #2]\n\t" + "LDR lr, [%[td], lr, LSL #2]\n\t" + "EOR r12, r12, r4, ROR #24\n\t" + "LDM r3!, {r4, r5, r6, r7}\n\t" + "EOR r11, r11, lr, ROR #8\n\t" + "EOR r11, r11, r12, ROR #24\n\t" + /* XOR in Key Schedule */ + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "UBFX r4, r9, #0, #8\n\t" + "UBFX r7, r10, #8, #8\n\t" + "UBFX r12, r11, #16, #8\n\t" + "LSR lr, r8, #24\n\t" + "LDRB r4, [%[td4], r4]\n\t" + "LDRB r7, [%[td4], r7]\n\t" + "LDRB r12, [%[td4], r12]\n\t" + "LDRB lr, [%[td4], lr]\n\t" + "UBFX r5, r10, #0, #8\n\t" + "EOR r4, r4, r7, LSL #8\n\t" + "UBFX r7, r11, #8, #8\n\t" + "EOR r4, r4, r12, LSL #16\n\t" + "UBFX r12, r8, #16, #8\n\t" + "EOR r4, r4, lr, LSL #24\n\t" + "LSR lr, r9, #24\n\t" + "LDRB r7, [%[td4], r7]\n\t" + "LDRB lr, [%[td4], lr]\n\t" + "LDRB r5, [%[td4], r5]\n\t" + "LDRB r12, [%[td4], r12]\n\t" + "UBFX r6, r11, #0, #8\n\t" + "EOR r5, r5, r7, LSL #8\n\t" + "UBFX r7, r8, #8, #8\n\t" + "EOR r5, r5, r12, LSL #16\n\t" + "UBFX r12, r9, #16, #8\n\t" + "EOR r5, r5, lr, LSL #24\n\t" + "LSR lr, r10, #24\n\t" + "LDRB r7, [%[td4], r7]\n\t" + "LDRB lr, [%[td4], lr]\n\t" + "LDRB r6, [%[td4], r6]\n\t" + "LDRB r12, [%[td4], r12]\n\t" + "LSR r11, r11, #24\n\t" + "EOR r6, r6, r7, LSL #8\n\t" + "UBFX r7, r8, #0, #8\n\t" + "EOR r6, r6, r12, LSL #16\n\t" + "UBFX r12, r9, #8, #8\n\t" + "EOR r6, r6, lr, LSL #24\n\t" + "UBFX lr, r10, #16, #8\n\t" + "LDRB r11, [%[td4], r11]\n\t" + "LDRB r12, [%[td4], r12]\n\t" + "LDRB r7, [%[td4], r7]\n\t" + "LDRB lr, [%[td4], lr]\n\t" + "EOR r12, r12, r11, LSL #16\n\t" + "LDM r3, {r8, r9, r10, r11}\n\t" + "EOR r7, r7, r12, LSL #8\n\t" + "EOR r7, r7, lr, LSL #16\n\t" + /* XOR in Key Schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + : [td] "+r" (td), [nr] "+r" (nr), [td4] "+r" (td4) + : + : "memory", "lr", "cc" + ); +} + +static const word32* L_AES_Thumb2_td_ecb = L_AES_Thumb2_td_data; +static const byte L_AES_Thumb2_td4[] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 
0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +}; + +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +#else +void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r5") = (word32*)L_AES_Thumb2_td_ecb; + register byte* L_AES_Thumb2_td4_c __asm__ ("r6") = (byte*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" + "MOV r12, %[len]\n\t" + "MOV r2, %[L_AES_Thumb2_td4]\n\t" + "CMP r8, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_ECB_decrypt_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_ECB_decrypt_start_block_128\n\t" +#else + "BEQ.W L_AES_ECB_decrypt_start_block_128_%=\n\t" +#endif + "CMP r8, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_ECB_decrypt_start_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_ECB_decrypt_start_block_192\n\t" +#else + "BEQ.W L_AES_ECB_decrypt_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_loop_block_256:\n\t" +#else + "L_AES_ECB_decrypt_loop_block_256_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[ks], r12, lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL 
AES_decrypt_block\n\t" + "POP {r1, %[ks], r12, lr}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_decrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_decrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_ECB_decrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_ECB_decrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_start_block_192:\n\t" +#else + "L_AES_ECB_decrypt_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_loop_block_192:\n\t" +#else + "L_AES_ECB_decrypt_loop_block_192_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[ks], r12, lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_decrypt_block\n\t" + "POP {r1, %[ks], r12, lr}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_decrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_decrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_ECB_decrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_ECB_decrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_start_block_128:\n\t" +#else + "L_AES_ECB_decrypt_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_loop_block_128:\n\t" +#else + "L_AES_ECB_decrypt_loop_block_128_%=:\n\t" +#endif + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "PUSH {r1, %[ks], r12, lr}\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_decrypt_block\n\t" + "POP {r1, %[ks], r12, lr}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_ECB_decrypt_loop_block_128\n\t" 
+#else + "BNE.W L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_ECB_decrypt_end:\n\t" +#else + "L_AES_ECB_decrypt_end_%=:\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) + : + : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) + : "memory", "r12", "lr", "r4", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +#else +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; + register word32* L_AES_Thumb2_td_ecb_c __asm__ ("r6") = (word32*)L_AES_Thumb2_td_ecb; + register byte* L_AES_Thumb2_td4_c __asm__ ("r7") = (byte*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r4, r5\n\t" +#else + "LDR r4, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" + "MOV r12, %[len]\n\t" + "MOV r2, %[L_AES_Thumb2_td4]\n\t" + "PUSH {%[ks], r4}\n\t" + "CMP r8, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_decrypt_loop_block_128\n\t" +#else + "BEQ.W L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#endif + "CMP r8, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_decrypt_loop_block_192\n\t" +#else + "BEQ.W L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_decrypt_loop_block_256:\n\t" +#else + "L_AES_CBC_decrypt_loop_block_256_%=:\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr, #16]\n\t" + "STRD r6, r7, [lr, #24]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" 
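+ /* The raw ciphertext block was stashed above so CBC decryption can use it as the chaining value for the next block. */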
+ "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDM lr, {r8, r9, r10, r11}\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" +#else + "BEQ.W L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr]\n\t" + "STRD r6, r7, [lr, #8]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDRD r8, r9, [lr, #16]\n\t" + "LDRD r10, r11, [lr, #24]\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_decrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CBC_decrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_decrypt_loop_block_192:\n\t" +#else + "L_AES_CBC_decrypt_loop_block_192_%=:\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr, #16]\n\t" + "STRD r6, r7, [lr, #24]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDM lr, {r8, r9, r10, r11}\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + 
"BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" +#else + "BEQ.W L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr]\n\t" + "STRD r6, r7, [lr, #8]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDRD r8, r9, [lr, #16]\n\t" + "LDRD r10, r11, [lr, #24]\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_decrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CBC_decrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_decrypt_loop_block_128:\n\t" +#else + "L_AES_CBC_decrypt_loop_block_128_%=:\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr, #16]\n\t" + "STRD r6, r7, [lr, #24]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, [sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDM lr, {r8, r9, r10, r11}\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" +#else + "BEQ.W L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif + "PUSH {r1, r12, lr}\n\t" + "LDR r4, [lr]\n\t" + "LDR r5, [lr, #4]\n\t" + "LDR r6, [lr, #8]\n\t" + "LDR r7, [lr, #12]\n\t" + "LDR lr, [sp, #16]\n\t" + "STRD r4, r5, [lr]\n\t" + "STRD r6, r7, [lr, #8]\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_decrypt_block\n\t" + "LDR lr, 
[sp, #16]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDRD r8, r9, [lr, #16]\n\t" + "LDRD r10, r11, [lr, #24]\n\t" + "POP {r1, r12, lr}\n\t" + "LDR %[ks], [sp]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "SUBS r12, r12, #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_CBC_decrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_CBC_decrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_AES_CBC_decrypt_end\n\t" +#else + "B.N L_AES_CBC_decrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_decrypt_end_odd:\n\t" +#else + "L_AES_CBC_decrypt_end_odd_%=:\n\t" +#endif + "LDR r4, [sp, #4]\n\t" + "LDRD r8, r9, [r4, #16]\n\t" + "LDRD r10, r11, [r4, #24]\n\t" + "STRD r8, r9, [r4]\n\t" + "STRD r10, r11, [r4, #8]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_CBC_decrypt_end:\n\t" +#else + "L_AES_CBC_decrypt_end_%=:\n\t" +#endif + "POP {%[ks], r4}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) + : + : "memory", "r12", "lr", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) + : "memory", "r12", "lr", "r4", "r5", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)iv; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +XALIGNED(16) static const word32 L_GCM_gmult_len_r[] = { + 0x00000000, 0x1c200000, 0x38400000, 0x24600000, + 0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000, + 0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000, + 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, +}; + +void GCM_gmult_len(unsigned char* x, const unsigned char** m, + const unsigned char* data, unsigned long len); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned char* data_p, unsigned long len_p) +#else +void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned char* data, unsigned long len) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register unsigned char* x __asm__ ("r0") = (unsigned char*)x_p; + register const unsigned char** m __asm__ ("r1") = (const unsigned char**)m_p; + register const unsigned char* data __asm__ ("r2") = (const unsigned char*)data_p; + register unsigned long len __asm__ ("r3") = (unsigned long)len_p; + register word32* L_GCM_gmult_len_r_c __asm__ ("r4") = (word32*)&L_GCM_gmult_len_r; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV lr, %[L_GCM_gmult_len_r]\n\t" + "\n" +#if 
defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_GCM_gmult_len_start_block:\n\t" +#else + "L_GCM_gmult_len_start_block_%=:\n\t" +#endif + "PUSH {r3}\n\t" + "LDR r12, [r0, #12]\n\t" + "LDR %[len], [r2, #12]\n\t" + "EOR r12, r12, %[len]\n\t" + "LSR %[len], r12, #24\n\t" + "AND %[len], %[len], #0xf\n\t" + "ADD %[len], %[m], %[len], LSL #4\n\t" + "LDM %[len], {r8, r9, r10, r11}\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #28\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #16\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #20\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #8\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #12\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "AND r4, r12, #0xf\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM 
r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "LDR r12, [r0, #8]\n\t" + "LDR %[len], [r2, #8]\n\t" + "EOR r12, r12, %[len]\n\t" + "LSR %[len], r12, #24\n\t" + "AND %[len], %[len], #0xf\n\t" + "ADD %[len], %[m], %[len], LSL #4\n\t" + "LDM %[len], {r4, r5, r6, r7}\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #28\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #16\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #20\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #8\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #12\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "AND r4, r12, #0xf\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + 
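+ /* r4-r7 now hold the precomputed multiple of H selected by this nibble; the EORs below fold it into the GHASH accumulator in r8-r11. */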
"EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "LDR r12, [r0, #4]\n\t" + "LDR %[len], [r2, #4]\n\t" + "EOR r12, r12, %[len]\n\t" + "LSR %[len], r12, #24\n\t" + "AND %[len], %[len], #0xf\n\t" + "ADD %[len], %[m], %[len], LSL #4\n\t" + "LDM %[len], {r4, r5, r6, r7}\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #28\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #16\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #20\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #8\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #12\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + 
"EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "AND r4, r12, #0xf\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "LDR r12, [r0]\n\t" + "LDR %[len], [r2]\n\t" + "EOR r12, r12, %[len]\n\t" + "LSR %[len], r12, #24\n\t" + "AND %[len], %[len], #0xf\n\t" + "ADD %[len], %[m], %[len], LSL #4\n\t" + "LDM %[len], {r4, r5, r6, r7}\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #28\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #16\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #20\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #8\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + 
"LSR r4, r12, #12\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "AND r4, r12, #0xf\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LSR r6, r10, #4\n\t" + "AND %[len], r11, #0xf\n\t" + "LSR r11, r11, #4\n\t" + "LSR r4, r12, #4\n\t" + "EOR r11, r11, r10, LSL #28\n\t" + "AND r4, r4, #0xf\n\t" + "LDR %[len], [lr, r3, LSL #2]\n\t" + "ADD r4, %[m], r4, LSL #4\n\t" + "EOR r10, r6, r9, LSL #28\n\t" + "LSR r9, r9, #4\n\t" + "LDM r4, {r4, r5, r6, r7}\n\t" + "EOR r9, r9, r8, LSL #28\n\t" + "EOR r8, %[len], r8, LSR #4\n\t" + "EOR r8, r8, r4\n\t" + "EOR r9, r9, r5\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STM %[x], {r8, r9, r10, r11}\n\t" + "POP {r3}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD %[data], %[data], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_GCM_gmult_len_start_block_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_GCM_gmult_len_start_block\n\t" +#else + "BNE.W L_GCM_gmult_len_start_block_%=\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), + [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) + : + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len) + : [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r) + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +static const word32* L_AES_Thumb2_te_gcm = L_AES_Thumb2_te_data; +void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +#else +void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register const unsigned char* in __asm__ ("r0") = (const unsigned char*)in_p; + register unsigned char* out __asm__ ("r1") = (unsigned char*)out_p; + register unsigned long len __asm__ ("r2") = (unsigned long)len_p; + register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; + register int nr __asm__ ("r4") = (int)nr_p; + register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; + register word32* L_AES_Thumb2_te_gcm_c __asm__ ("r6") = (word32*)L_AES_Thumb2_te_gcm; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif 
/* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + "MOV lr, %[in]\n\t" + "MOV r0, %[L_AES_Thumb2_te_gcm]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" + "PUSH {%[ks], r8}\n\t" + "CMP r12, #0xa\n\t" +#if defined(__GNUC__) + "BEQ L_AES_GCM_encrypt_start_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_GCM_encrypt_start_block_128\n\t" +#else + "BEQ.W L_AES_GCM_encrypt_start_block_128_%=\n\t" +#endif + "CMP r12, #0xc\n\t" +#if defined(__GNUC__) + "BEQ L_AES_GCM_encrypt_start_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.W L_AES_GCM_encrypt_start_block_192\n\t" +#else + "BEQ.W L_AES_GCM_encrypt_start_block_192_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_loop_block_256:\n\t" +#else + "L_AES_GCM_encrypt_loop_block_256_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADD r7, r7, #0x1\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "STR r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x6\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCM_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_GCM_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_start_block_192:\n\t" +#else + "L_AES_GCM_encrypt_start_block_192_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_loop_block_192:\n\t" +#else + "L_AES_GCM_encrypt_loop_block_192_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADD r7, r7, #0x1\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "STR r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x5\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, 
[%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCM_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_AES_GCM_encrypt_end_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.W L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_start_block_128:\n\t" +#else + "L_AES_GCM_encrypt_start_block_128_%=:\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_loop_block_128:\n\t" +#else + "L_AES_GCM_encrypt_loop_block_128_%=:\n\t" +#endif + "PUSH {r1, %[len], lr}\n\t" + "LDR lr, [sp, #16]\n\t" + "ADD r7, r7, #0x1\n\t" + "LDM %[ks]!, {r8, r9, r10, r11}\n\t" + "STR r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "MOV r1, #0x4\n\t" + "BL AES_encrypt_block\n\t" + "POP {r1, %[len], lr}\n\t" + "LDR %[ks], [sp]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "LDR r8, [lr]\n\t" + "LDR r9, [lr, #4]\n\t" + "LDR r10, [lr, #8]\n\t" + "LDR r11, [lr, #12]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDR r8, [sp, #4]\n\t" + "STR r4, [%[out]]\n\t" + "STR r5, [%[out], #4]\n\t" + "STR r6, [%[out], #8]\n\t" + "STR r7, [%[out], #12]\n\t" + "LDM r8, {r4, r5, r6, r7}\n\t" + "SUBS %[len], %[len], #0x10\n\t" + "ADD lr, lr, #0x10\n\t" + "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) + "BNE L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_AES_GCM_encrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_AES_GCM_encrypt_end:\n\t" +#else + "L_AES_GCM_encrypt_end_%=:\n\t" +#endif + "POP {%[ks], r8}\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "STM r8, {r4, r5, r6, r7}\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) + : + : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11", "cc" +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm) + : "memory", "r12", "lr", "r4", "r5", "r7", "r8", "r9", "r10", "r11", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)nr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifdef WOLFSSL_NO_VAR_ASSIGN_REG + (void)ctr; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +} + +#endif /* HAVE_AESGCM */ +#endif /* !NO_AES */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S new file mode 100644 index 000000000..d004d6b67 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -0,0 +1,1490 @@ +/* thumb2-sha256-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif /* HAVE_CONFIG_H */ +#include <wolfssl/wolfcrypt/settings.h> + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifndef NO_SHA256 +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA256_transform_len_k, %object + .size L_SHA256_transform_len_k, 256 + .align 4 +L_SHA256_transform_len_k: + .word 0x428a2f98 + .word 0x71374491 + .word 0xb5c0fbcf + .word 0xe9b5dba5 + .word 0x3956c25b + .word 0x59f111f1 + .word 0x923f82a4 + .word 0xab1c5ed5 + .word 0xd807aa98 + .word 0x12835b01 + .word 0x243185be + .word 0x550c7dc3 + .word 0x72be5d74 + .word 0x80deb1fe + .word 0x9bdc06a7 + .word 0xc19bf174 + .word 0xe49b69c1 + .word 0xefbe4786 + .word 0xfc19dc6 + .word 0x240ca1cc + .word 0x2de92c6f + .word 0x4a7484aa + .word 0x5cb0a9dc + .word 0x76f988da + .word 0x983e5152 + .word 0xa831c66d + .word 0xb00327c8 + .word 0xbf597fc7 + .word 0xc6e00bf3 + .word 0xd5a79147 + .word 0x6ca6351 + .word 0x14292967 + .word 0x27b70a85 + .word 0x2e1b2138 + .word 0x4d2c6dfc + .word 0x53380d13 + .word 0x650a7354 + .word 0x766a0abb + .word 0x81c2c92e + .word 0x92722c85 + .word 0xa2bfe8a1 + .word 0xa81a664b + .word 0xc24b8b70 + .word 0xc76c51a3 + .word 0xd192e819 + .word 0xd6990624 + .word 0xf40e3585 + .word 0x106aa070 + .word 0x19a4c116 + .word 0x1e376c08 + .word 0x2748774c + .word 0x34b0bcb5 + .word 0x391c0cb3 + .word 0x4ed8aa4a + .word 0x5b9cca4f + .word 0x682e6ff3 + .word 0x748f82ee + .word 0x78a5636f + .word 0x84c87814 + .word 0x8cc70208 + .word 0x90befffa + .word 0xa4506ceb + .word 0xbef9a3f7 + .word 0xc67178f2 + .text + .align 4 + .globl Transform_Sha256_Len + .type Transform_Sha256_Len, %function +Transform_Sha256_Len: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc0 + ADR r3, L_SHA256_transform_len_k + /* Copy digest to add in at end */ + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + LDRD r10, r11, [r0, #24] + STRD r4, r5, [sp, #64] + STRD r6, r7, [sp, #72] + STRD r8, r9, [sp, #80] + STRD r10, r11, [sp, #88] + /* Start of loop processing a block */ +L_SHA256_transform_len_begin: + /* Load, Reverse and Store W - 64 bytes */ + LDR r4, [r1] + LDR r5, [r1, #4] + LDR r6, [r1, #8] + LDR r7, [r1, #12] + LDR r8, [r1, #16] + LDR r9, [r1, #20] + LDR r10, [r1, #24] + LDR r11, [r1, #28] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STRD r4, r5, [sp] + STRD r6, r7, [sp, #8] + STRD r8, r9, [sp, #16] + STRD r10, r11, [sp, #24] + LDR r4, [r1, #32] + LDR r5, [r1, #36] + LDR r6, [r1, #40] + LDR r7, [r1, #44] + LDR r8, [r1, #48] + LDR r9, [r1, #52] + LDR r10, [r1, #56] + LDR r11, [r1, #60] + REV r4, r4 + 
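+ /* The second half of the block is byte-reversed and stored below as W[8..15]. */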
REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STRD r4, r5, [sp, #32] + STRD r6, r7, [sp, #40] + STRD r8, r9, [sp, #48] + STRD r10, r11, [sp, #56] + LDR r11, [r0, #4] + LDR r4, [r0, #8] + EOR r11, r11, r4 + MOV r12, #0x3 + /* Start of 16 rounds */ +L_SHA256_transform_len_start: + /* Round 0 */ + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp] + LDR r6, [r3] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + /* Calc new W[0] */ + LDR r6, [sp, #56] + LDR r7, [sp, #36] + LDR r8, [sp, #4] + LDR r9, [sp] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp] + /* Round 1 */ + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #4] + LDR r6, [r3, #4] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + /* Calc new W[1] */ + LDR r6, [sp, #60] + LDR r7, [sp, #40] + LDR r8, [sp, #8] + LDR r9, [sp, #4] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #4] + /* Round 2 */ + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #8] + LDR r6, [r3, #8] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + /* Calc new W[2] */ + LDR r6, [sp] + LDR r7, [sp, #44] + LDR r8, [sp, #12] + LDR r9, [sp, #8] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #8] + /* Round 3 */ + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #12] + LDR r6, [r3, #12] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR 
r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + /* Calc new W[3] */ + LDR r6, [sp, #4] + LDR r7, [sp, #48] + LDR r8, [sp, #16] + LDR r9, [sp, #12] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #12] + /* Round 4 */ + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #16] + LDR r6, [r3, #16] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + /* Calc new W[4] */ + LDR r6, [sp, #8] + LDR r7, [sp, #52] + LDR r8, [sp, #20] + LDR r9, [sp, #16] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #16] + /* Round 5 */ + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #20] + LDR r6, [r3, #20] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + /* Calc new W[5] */ + LDR r6, [sp, #12] + LDR r7, [sp, #56] + LDR r8, [sp, #24] + LDR r9, [sp, #20] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #20] + /* Round 6 */ + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #24] + LDR r6, [r3, #24] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + /* Calc new W[6] */ + LDR r6, [sp, #16] + LDR r7, [sp, #60] + LDR r8, [sp, #28] + LDR r9, [sp, #24] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #24] + /* Round 7 */ + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #28] + LDR r6, [r3, #28] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR 
r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + /* Calc new W[7] */ + LDR r6, [sp, #20] + LDR r7, [sp] + LDR r8, [sp, #32] + LDR r9, [sp, #28] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #28] + /* Round 8 */ + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #32] + LDR r6, [r3, #32] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + /* Calc new W[8] */ + LDR r6, [sp, #24] + LDR r7, [sp, #4] + LDR r8, [sp, #36] + LDR r9, [sp, #32] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #32] + /* Round 9 */ + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #36] + LDR r6, [r3, #36] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + /* Calc new W[9] */ + LDR r6, [sp, #28] + LDR r7, [sp, #8] + LDR r8, [sp, #40] + LDR r9, [sp, #36] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #36] + /* Round 10 */ + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #40] + LDR r6, [r3, #40] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + /* Calc new W[10] */ + LDR r6, [sp, #32] + LDR r7, [sp, #12] + LDR r8, [sp, #44] + LDR r9, [sp, #40] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #40] + /* Round 11 */ + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR 
r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #44] + LDR r6, [r3, #44] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + /* Calc new W[11] */ + LDR r6, [sp, #36] + LDR r7, [sp, #16] + LDR r8, [sp, #48] + LDR r9, [sp, #44] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #44] + /* Round 12 */ + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #48] + LDR r6, [r3, #48] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + /* Calc new W[12] */ + LDR r6, [sp, #40] + LDR r7, [sp, #20] + LDR r8, [sp, #52] + LDR r9, [sp, #48] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #48] + /* Round 13 */ + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #52] + LDR r6, [r3, #52] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + /* Calc new W[13] */ + LDR r6, [sp, #44] + LDR r7, [sp, #24] + LDR r8, [sp, #56] + LDR r9, [sp, #52] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #52] + /* Round 14 */ + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #56] + LDR r6, [r3, #56] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + /* Calc new W[14] */ + LDR r6, [sp, #48] + LDR r7, [sp, #28] + LDR r8, [sp, #60] + LDR r9, [sp, #56] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #56] + /* Round 15 */ + LDR r5, [r0, #20] + LDR r6, [r0, 
#24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #60] + LDR r6, [r3, #60] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + /* Calc new W[15] */ + LDR r6, [sp, #52] + LDR r7, [sp, #32] + LDR r8, [sp] + LDR r9, [sp, #60] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #60] + ADD r3, r3, #0x40 + SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_SHA256_transform_len_start +#else + BNE.W L_SHA256_transform_len_start +#endif + /* Round 0 */ + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp] + LDR r6, [r3] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + /* Round 1 */ + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #4] + LDR r6, [r3, #4] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + /* Round 2 */ + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #8] + LDR r6, [r3, #8] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + /* Round 3 */ + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #12] + LDR r6, [r3, #12] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + /* Round 4 */ + LDR r5, [r0] + LDR r6, 
[r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #16] + LDR r6, [r3, #16] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + /* Round 5 */ + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #20] + LDR r6, [r3, #20] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + /* Round 6 */ + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #24] + LDR r6, [r3, #24] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + /* Round 7 */ + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #28] + LDR r6, [r3, #28] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + /* Round 8 */ + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #32] + LDR r6, [r3, #32] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + /* Round 9 */ + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #36] + LDR r6, [r3, #36] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + 
ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + /* Round 10 */ + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #40] + LDR r6, [r3, #40] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + /* Round 11 */ + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #44] + LDR r6, [r3, #44] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + /* Round 12 */ + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #48] + LDR r6, [r3, #48] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + /* Round 13 */ + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #52] + LDR r6, [r3, #52] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + /* Round 14 */ + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #56] + LDR r6, [r3, #56] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + /* Round 15 */ + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #60] + LDR r6, [r3, #60] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, 
#16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + /* Add in digest from start */ + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [sp, #64] + LDRD r10, r11, [sp, #72] + ADD r4, r4, r8 + ADD r5, r5, r9 + ADD r6, r6, r10 + ADD r7, r7, r11 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #8] + STRD r4, r5, [sp, #64] + STRD r6, r7, [sp, #72] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [sp, #80] + LDRD r10, r11, [sp, #88] + ADD r4, r4, r8 + ADD r5, r5, r9 + ADD r6, r6, r10 + ADD r7, r7, r11 + STRD r4, r5, [r0, #16] + STRD r6, r7, [r0, #24] + STRD r4, r5, [sp, #80] + STRD r6, r7, [sp, #88] + SUBS r2, r2, #0x40 + SUB r3, r3, #0xc0 + ADD r1, r1, #0x40 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_SHA256_transform_len_begin +#else + BNE.W L_SHA256_transform_len_begin +#endif + ADD sp, sp, #0xc0 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 1874 */ + .size Transform_Sha256_Len,.-Transform_Sha256_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c new file mode 100644 index 000000000..0b7642ee6 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -0,0 +1,1480 @@ +/* thumb2-sha256-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif /* HAVE_CONFIG_H */ +#include <wolfssl/wolfcrypt/settings.h> +#include <wolfssl/wolfcrypt/error-crypt.h> + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifndef NO_SHA256 +#include <wolfssl/wolfcrypt/sha256.h> + +#ifdef WOLFSSL_ARMASM_NO_NEON +XALIGNED(16) static const word32 L_SHA256_transform_len_k[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) +#else +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p; + register const byte* data __asm__ ("r1") = (const byte*)data_p; + register word32 len __asm__ ("r2") = (word32)len_p; + register word32* L_SHA256_transform_len_k_c __asm__ ("r3") = (word32*)&L_SHA256_transform_len_k; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0xc0\n\t" + "MOV r3, %[L_SHA256_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "LDRD r4, r5, [%[sha256]]\n\t" + "LDRD r6, r7, [%[sha256], #8]\n\t" + "LDRD r8, r9, [%[sha256], #16]\n\t" + "LDRD r10, r11, [%[sha256], #24]\n\t" + "STRD r4, r5, [sp, #64]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "STRD r8, r9, [sp, #80]\n\t" + "STRD r10, r11, [sp, #88]\n\t" + /* Start of loop processing a block */ + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_SHA256_transform_len_begin:\n\t" +#else + "L_SHA256_transform_len_begin_%=:\n\t" +#endif + /* Load, Reverse and Store W - 64 bytes */ + "LDR r4, [%[data]]\n\t" + "LDR r5, [%[data], #4]\n\t" + "LDR r6, [%[data], #8]\n\t" + "LDR r7, [%[data], #12]\n\t" + "LDR r8, [%[data], #16]\n\t" + "LDR r9, [%[data], #20]\n\t" + "LDR r10, [%[data], #24]\n\t" + "LDR r11, [%[data], #28]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STRD r4, r5, [sp]\n\t" + "STRD r6, r7, [sp, #8]\n\t" + "STRD r8, r9, [sp, #16]\n\t" + "STRD r10, 
r11, [sp, #24]\n\t" + "LDR r4, [%[data], #32]\n\t" + "LDR r5, [%[data], #36]\n\t" + "LDR r6, [%[data], #40]\n\t" + "LDR r7, [%[data], #44]\n\t" + "LDR r8, [%[data], #48]\n\t" + "LDR r9, [%[data], #52]\n\t" + "LDR r10, [%[data], #56]\n\t" + "LDR r11, [%[data], #60]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STRD r4, r5, [sp, #32]\n\t" + "STRD r6, r7, [sp, #40]\n\t" + "STRD r8, r9, [sp, #48]\n\t" + "STRD r10, r11, [sp, #56]\n\t" + "LDR r11, [%[sha256], #4]\n\t" + "LDR r4, [%[sha256], #8]\n\t" + "EOR r11, r11, r4\n\t" + "MOV r12, #0x3\n\t" + /* Start of 16 rounds */ + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_SHA256_transform_len_start:\n\t" +#else + "L_SHA256_transform_len_start_%=:\n\t" +#endif + /* Round 0 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp]\n\t" + "LDR r6, [r3]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Calc new W[0] */ + "LDR r6, [sp, #56]\n\t" + "LDR r7, [sp, #36]\n\t" + "LDR r8, [sp, #4]\n\t" + "LDR r9, [sp]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp]\n\t" + /* Round 1 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #4]\n\t" + "LDR r6, [r3, #4]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Calc new W[1] */ + "LDR r6, [sp, #60]\n\t" + "LDR r7, [sp, #40]\n\t" + "LDR r8, [sp, #8]\n\t" + "LDR r9, [sp, #4]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #4]\n\t" + /* Round 2 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR 
r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #8]\n\t" + "LDR r6, [r3, #8]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Calc new W[2] */ + "LDR r6, [sp]\n\t" + "LDR r7, [sp, #44]\n\t" + "LDR r8, [sp, #12]\n\t" + "LDR r9, [sp, #8]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #8]\n\t" + /* Round 3 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #12]\n\t" + "LDR r6, [r3, #12]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Calc new W[3] */ + "LDR r6, [sp, #4]\n\t" + "LDR r7, [sp, #48]\n\t" + "LDR r8, [sp, #16]\n\t" + "LDR r9, [sp, #12]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #12]\n\t" + /* Round 4 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #16]\n\t" + "LDR r6, [r3, #16]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Calc new W[4] */ + "LDR r6, [sp, #8]\n\t" + "LDR r7, [sp, #52]\n\t" + "LDR r8, [sp, #20]\n\t" + "LDR r9, [sp, #16]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" 
+ "STR r9, [sp, #16]\n\t" + /* Round 5 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #20]\n\t" + "LDR r6, [r3, #20]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Calc new W[5] */ + "LDR r6, [sp, #12]\n\t" + "LDR r7, [sp, #56]\n\t" + "LDR r8, [sp, #24]\n\t" + "LDR r9, [sp, #20]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #20]\n\t" + /* Round 6 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #24]\n\t" + "LDR r6, [r3, #24]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Calc new W[6] */ + "LDR r6, [sp, #16]\n\t" + "LDR r7, [sp, #60]\n\t" + "LDR r8, [sp, #28]\n\t" + "LDR r9, [sp, #24]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #24]\n\t" + /* Round 7 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #28]\n\t" + "LDR r6, [r3, #28]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Calc new W[7] */ + "LDR r6, [sp, #20]\n\t" + "LDR r7, [sp]\n\t" + "LDR r8, [sp, #32]\n\t" + "LDR r9, [sp, #28]\n\t" + "ROR r4, 
r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #28]\n\t" + /* Round 8 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #32]\n\t" + "LDR r6, [r3, #32]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Calc new W[8] */ + "LDR r6, [sp, #24]\n\t" + "LDR r7, [sp, #4]\n\t" + "LDR r8, [sp, #36]\n\t" + "LDR r9, [sp, #32]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #32]\n\t" + /* Round 9 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #36]\n\t" + "LDR r6, [r3, #36]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Calc new W[9] */ + "LDR r6, [sp, #28]\n\t" + "LDR r7, [sp, #8]\n\t" + "LDR r8, [sp, #40]\n\t" + "LDR r9, [sp, #36]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #36]\n\t" + /* Round 10 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #40]\n\t" + "LDR r6, [r3, #40]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD 
r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Calc new W[10] */ + "LDR r6, [sp, #32]\n\t" + "LDR r7, [sp, #12]\n\t" + "LDR r8, [sp, #44]\n\t" + "LDR r9, [sp, #40]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #40]\n\t" + /* Round 11 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #44]\n\t" + "LDR r6, [r3, #44]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Calc new W[11] */ + "LDR r6, [sp, #36]\n\t" + "LDR r7, [sp, #16]\n\t" + "LDR r8, [sp, #48]\n\t" + "LDR r9, [sp, #44]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #44]\n\t" + /* Round 12 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #48]\n\t" + "LDR r6, [r3, #48]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Calc new W[12] */ + "LDR r6, [sp, #40]\n\t" + "LDR r7, [sp, #20]\n\t" + "LDR r8, [sp, #52]\n\t" + "LDR r9, [sp, #48]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #48]\n\t" + /* Round 13 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #52]\n\t" + "LDR r6, [r3, #52]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, 
[%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Calc new W[13] */ + "LDR r6, [sp, #44]\n\t" + "LDR r7, [sp, #24]\n\t" + "LDR r8, [sp, #56]\n\t" + "LDR r9, [sp, #52]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #52]\n\t" + /* Round 14 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #56]\n\t" + "LDR r6, [r3, #56]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Calc new W[14] */ + "LDR r6, [sp, #48]\n\t" + "LDR r7, [sp, #28]\n\t" + "LDR r8, [sp, #60]\n\t" + "LDR r9, [sp, #56]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #56]\n\t" + /* Round 15 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #60]\n\t" + "LDR r6, [r3, #60]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Calc new W[15] */ + "LDR r6, [sp, #52]\n\t" + "LDR r7, [sp, #32]\n\t" + "LDR r8, [sp]\n\t" + "LDR r9, [sp, #60]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #60]\n\t" + "ADD r3, r3, #0x40\n\t" + "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_SHA256_transform_len_start_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_SHA256_transform_len_start\n\t" +#else + "BNE.W L_SHA256_transform_len_start_%=\n\t" 
+#endif + /* Round 0 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp]\n\t" + "LDR r6, [r3]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Round 1 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #4]\n\t" + "LDR r6, [r3, #4]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Round 2 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #8]\n\t" + "LDR r6, [r3, #8]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Round 3 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #12]\n\t" + "LDR r6, [r3, #12]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Round 4 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR 
r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #16]\n\t" + "LDR r6, [r3, #16]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Round 5 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #20]\n\t" + "LDR r6, [r3, #20]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Round 6 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #24]\n\t" + "LDR r6, [r3, #24]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Round 7 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #28]\n\t" + "LDR r6, [r3, #28]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Round 8 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, 
r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #32]\n\t" + "LDR r6, [r3, #32]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Round 9 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #36]\n\t" + "LDR r6, [r3, #36]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Round 10 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #40]\n\t" + "LDR r6, [r3, #40]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Round 11 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #44]\n\t" + "LDR r6, [r3, #44]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Round 12 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, 
r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #48]\n\t" + "LDR r6, [r3, #48]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Round 13 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #52]\n\t" + "LDR r6, [r3, #52]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Round 14 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #56]\n\t" + "LDR r6, [r3, #56]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Round 15 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #60]\n\t" + "LDR r6, [r3, #60]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Add in digest from start */ + "LDRD r4, r5, [%[sha256]]\n\t" + "LDRD r6, r7, [%[sha256], #8]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "LDRD r10, r11, [sp, #72]\n\t" + "ADD r4, r4, r8\n\t" + "ADD r5, r5, r9\n\t" + "ADD r6, r6, r10\n\t" + "ADD r7, r7, r11\n\t" + "STRD r4, r5, [%[sha256]]\n\t" + "STRD r6, 
r7, [%[sha256], #8]\n\t" + "STRD r4, r5, [sp, #64]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "LDRD r4, r5, [%[sha256], #16]\n\t" + "LDRD r6, r7, [%[sha256], #24]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "LDRD r10, r11, [sp, #88]\n\t" + "ADD r4, r4, r8\n\t" + "ADD r5, r5, r9\n\t" + "ADD r6, r6, r10\n\t" + "ADD r7, r7, r11\n\t" + "STRD r4, r5, [%[sha256], #16]\n\t" + "STRD r6, r7, [%[sha256], #24]\n\t" + "STRD r4, r5, [sp, #80]\n\t" + "STRD r6, r7, [sp, #88]\n\t" + "SUBS %[len], %[len], #0x40\n\t" + "SUB r3, r3, #0xc0\n\t" + "ADD %[data], %[data], #0x40\n\t" +#if defined(__GNUC__) + "BNE L_SHA256_transform_len_begin_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_SHA256_transform_len_begin\n\t" +#else + "BNE.W L_SHA256_transform_len_begin_%=\n\t" +#endif + "ADD sp, sp, #0xc0\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" +#else + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm.S b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S new file mode 100644 index 000000000..a04b5adb8 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm.S @@ -0,0 +1,1176 @@ +/* thumb2-sha3-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha3/sha3.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha3-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef WOLFSSL_SHA3 + .text + .type L_sha3_thumb2_rt, %object + .size L_sha3_thumb2_rt, 192 + .align 8 +L_sha3_thumb2_rt: + .word 0x1 + .word 0x0 + .word 0x8082 + .word 0x0 + .word 0x808a + .word 0x80000000 + .word 0x80008000 + .word 0x80000000 + .word 0x808b + .word 0x0 + .word 0x80000001 + .word 0x0 + .word 0x80008081 + .word 0x80000000 + .word 0x8009 + .word 0x80000000 + .word 0x8a + .word 0x0 + .word 0x88 + .word 0x0 + .word 0x80008009 + .word 0x0 + .word 0x8000000a + .word 0x0 + .word 0x8000808b + .word 0x0 + .word 0x8b + .word 0x80000000 + .word 0x8089 + .word 0x80000000 + .word 0x8003 + .word 0x80000000 + .word 0x8002 + .word 0x80000000 + .word 0x80 + .word 0x80000000 + .word 0x800a + .word 0x0 + .word 0x8000000a + .word 0x80000000 + .word 0x80008081 + .word 0x80000000 + .word 0x8080 + .word 0x80000000 + .word 0x80000001 + .word 0x0 + .word 0x80008008 + .word 0x80000000 + .text + .align 4 + .globl BlockSha3 + .type BlockSha3, %function +BlockSha3: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xcc + ADR r1, L_sha3_thumb2_rt + MOV r2, #0xc +L_sha3_thumb2_begin: + STR r2, [sp, #200] + /* Round even */ + /* Calc b[4] */ + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #72] + LDRD r8, r9, [r0, #112] + LDRD r10, r11, [r0, #152] + LDR r12, [r0, #192] + LDR lr, [r0, #196] + EOR r2, r4, r6 + EOR r3, r5, r7 + EOR r2, r2, r8 + EOR r3, r3, r9 + EOR r2, r2, r10 + EOR r3, r3, r11 + EOR r2, r2, r12 + EOR r3, r3, lr + STRD r2, r3, [sp, #32] + /* Calc b[1] */ + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #88] + LDRD r10, r11, [r0, #128] + LDR r12, [r0, #168] + LDR lr, [r0, #172] + EOR r4, r4, r6 + EOR r5, r5, r7 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r4, r4, r10 + EOR r5, r5, r11 + EOR r4, r4, r12 + EOR r5, r5, lr + STRD r4, r5, [sp, #8] + /* Calc t[0] */ + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* Calc b[0] and XOR t[0] into s[x*5+0] */ + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #80] + LDRD r10, r11, [r0, #120] + EOR r12, r4, r6 + EOR lr, r5, r7 + EOR r12, r12, r8 + EOR lr, lr, r9 + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #40] + STRD r8, r9, [r0, #80] + STRD r10, r11, [r0, #120] + LDRD r10, r11, [r0, #160] + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r10, r11, [r0, #160] + STR r12, [sp] + STR lr, [sp, #4] + /* Calc b[3] */ + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #64] + LDRD r8, r9, [r0, #104] + LDRD r10, r11, [r0, #144] + LDR r12, [r0, #184] + LDR lr, [r0, #188] + EOR r4, r4, r6 + EOR r5, r5, r7 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r4, r4, r10 + EOR r5, r5, r11 + EOR r4, r4, r12 + EOR r5, r5, lr + STRD r4, r5, [sp, #24] + /* Calc t[2] */ + LDRD r2, r3, [sp, #8] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + 
EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* Calc b[2] and XOR t[2] into s[x*5+2] */ + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0, #96] + LDRD r10, r11, [r0, #136] + EOR r12, r4, r6 + EOR lr, r5, r7 + EOR r12, r12, r8 + EOR lr, lr, r9 + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r4, r5, [r0, #16] + STRD r6, r7, [r0, #56] + STRD r8, r9, [r0, #96] + STRD r10, r11, [r0, #136] + LDRD r10, r11, [r0, #176] + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r10, r11, [r0, #176] + STR r12, [sp, #16] + STR lr, [sp, #20] + /* Calc t[1] */ + LDRD r2, r3, [sp] + EOR r2, r2, lr, LSR #31 + EOR r3, r3, r12, LSR #31 + EOR r2, r2, r12, LSL #1 + EOR r3, r3, lr, LSL #1 + /* XOR t[1] into s[x*5+1] */ + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #88] + LDRD r10, r11, [r0, #128] + LDR r12, [r0, #168] + LDR lr, [r0, #172] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [r0, #8] + STRD r6, r7, [r0, #48] + STRD r8, r9, [r0, #88] + STRD r10, r11, [r0, #128] + STR r12, [r0, #168] + STR lr, [r0, #172] + /* Calc t[3] */ + LDRD r2, r3, [sp, #16] + LDRD r4, r5, [sp, #32] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* XOR t[3] into s[x*5+3] */ + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #64] + LDRD r8, r9, [r0, #104] + LDRD r10, r11, [r0, #144] + LDR r12, [r0, #184] + LDR lr, [r0, #188] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [r0, #24] + STRD r6, r7, [r0, #64] + STRD r8, r9, [r0, #104] + STRD r10, r11, [r0, #144] + STR r12, [r0, #184] + STR lr, [r0, #188] + /* Calc t[4] */ + LDRD r2, r3, [sp, #24] + LDRD r4, r5, [sp] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* XOR t[4] into s[x*5+4] */ + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #72] + LDRD r8, r9, [r0, #112] + LDRD r10, r11, [r0, #152] + LDR r12, [r0, #192] + LDR lr, [r0, #196] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [r0, #32] + STRD r6, r7, [r0, #72] + STRD r8, r9, [r0, #112] + STRD r10, r11, [r0, #152] + STR r12, [r0, #192] + STR lr, [r0, #196] + /* Row Mix */ + /* Row 0 */ + LDRD r2, r3, [r0] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #96] + LDRD r8, r9, [r0, #144] + LDRD r10, r11, [r0, #192] + /* s[1] <<< 44 */ + MOV lr, r4 + LSR r12, r5, #20 + LSR r4, r4, #20 + ORR r4, r4, r5, LSL #12 + ORR r5, r12, lr, LSL #12 + /* s[2] <<< 43 */ + MOV lr, r6 + LSR r12, r7, #21 + LSR r6, r6, #21 + ORR r6, r6, r7, LSL #11 + ORR r7, r12, lr, LSL #11 + /* s[3] <<< 21 */ + LSR r12, r9, #11 + LSR lr, r8, #11 + ORR r8, r12, r8, LSL #21 + ORR r9, lr, r9, LSL #21 + /* s[4] <<< 14 */ + LSR r12, r11, #18 + LSR lr, r10, #18 + ORR r10, r12, r10, LSL #14 + ORR r11, lr, r11, LSL #14 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [sp, #8] + STR lr, [sp, #12] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [sp, #16] + STR lr, [sp, #20] + BIC 
r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [sp, #24] + STR lr, [sp, #28] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp, #32] + STR lr, [sp, #36] + /* Get constant */ + LDRD r10, r11, [r1] + ADD r1, r1, #0x8 + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + /* XOR in constant */ + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp] + STR lr, [sp, #4] + /* Row 1 */ + LDRD r2, r3, [r0, #24] + LDRD r4, r5, [r0, #72] + LDRD r6, r7, [r0, #80] + LDRD r8, r9, [r0, #128] + LDRD r10, r11, [r0, #176] + /* s[0] <<< 28 */ + LSR r12, r3, #4 + LSR lr, r2, #4 + ORR r2, r12, r2, LSL #28 + ORR r3, lr, r3, LSL #28 + /* s[1] <<< 20 */ + LSR r12, r5, #12 + LSR lr, r4, #12 + ORR r4, r12, r4, LSL #20 + ORR r5, lr, r5, LSL #20 + /* s[2] <<< 3 */ + LSR r12, r7, #29 + LSR lr, r6, #29 + ORR r6, r12, r6, LSL #3 + ORR r7, lr, r7, LSL #3 + /* s[3] <<< 45 */ + MOV lr, r8 + LSR r12, r9, #19 + LSR r8, r8, #19 + ORR r8, r8, r9, LSL #13 + ORR r9, r12, lr, LSL #13 + /* s[4] <<< 61 */ + MOV lr, r10 + LSR r12, r11, #3 + LSR r10, r10, #3 + ORR r10, r10, r11, LSL #29 + ORR r11, r12, lr, LSL #29 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [sp, #48] + STR lr, [sp, #52] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [sp, #56] + STR lr, [sp, #60] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [sp, #64] + STR lr, [sp, #68] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp, #72] + STR lr, [sp, #76] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [sp, #40] + STR lr, [sp, #44] + /* Row 2 */ + LDRD r2, r3, [r0, #8] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0, #104] + LDRD r8, r9, [r0, #152] + LDRD r10, r11, [r0, #160] + /* s[0] <<< 1 */ + LSR r12, r3, #31 + LSR lr, r2, #31 + ORR r2, r12, r2, LSL #1 + ORR r3, lr, r3, LSL #1 + /* s[1] <<< 6 */ + LSR r12, r5, #26 + LSR lr, r4, #26 + ORR r4, r12, r4, LSL #6 + ORR r5, lr, r5, LSL #6 + /* s[2] <<< 25 */ + LSR r12, r7, #7 + LSR lr, r6, #7 + ORR r6, r12, r6, LSL #25 + ORR r7, lr, r7, LSL #25 + /* s[3] <<< 8 */ + LSR r12, r9, #24 + LSR lr, r8, #24 + ORR r8, r12, r8, LSL #8 + ORR r9, lr, r9, LSL #8 + /* s[4] <<< 18 */ + LSR r12, r11, #14 + LSR lr, r10, #14 + ORR r10, r12, r10, LSL #18 + ORR r11, lr, r11, LSL #18 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [sp, #88] + STR lr, [sp, #92] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [sp, #96] + STR lr, [sp, #100] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [sp, #104] + STR lr, [sp, #108] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp, #112] + STR lr, [sp, #116] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [sp, #80] + STR lr, [sp, #84] + /* Row 3 */ + LDRD r2, r3, [r0, #32] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #88] + LDRD r8, r9, [r0, #136] + LDRD r10, r11, [r0, #184] + /* s[0] <<< 27 */ + LSR r12, r3, #5 + LSR lr, r2, #5 + ORR r2, r12, r2, LSL #27 + ORR r3, lr, r3, LSL #27 + /* s[1] <<< 36 */ + MOV lr, r4 + LSR r12, r5, #28 + LSR r4, r4, #28 + ORR r4, r4, r5, LSL #4 + ORR r5, r12, lr, LSL #4 + /* s[2] <<< 10 */ + LSR r12, r7, #22 + LSR lr, r6, #22 + ORR r6, r12, r6, LSL #10 + ORR r7, lr, r7, LSL #10 + /* s[3] <<< 15 */ + LSR r12, r9, #17 + LSR lr, r8, #17 + ORR r8, r12, r8, 
LSL #15 + ORR r9, lr, r9, LSL #15 + /* s[4] <<< 56 */ + MOV lr, r10 + LSR r12, r11, #8 + LSR r10, r10, #8 + ORR r10, r10, r11, LSL #24 + ORR r11, r12, lr, LSL #24 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [sp, #128] + STR lr, [sp, #132] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [sp, #136] + STR lr, [sp, #140] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [sp, #144] + STR lr, [sp, #148] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp, #152] + STR lr, [sp, #156] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [sp, #120] + STR lr, [sp, #124] + /* Row 4 */ + LDRD r2, r3, [r0, #16] + LDRD r4, r5, [r0, #64] + LDRD r6, r7, [r0, #112] + LDRD r8, r9, [r0, #120] + LDRD r10, r11, [r0, #168] + /* s[0] <<< 62 */ + MOV lr, r2 + LSR r12, r3, #2 + LSR r2, r2, #2 + ORR r2, r2, r3, LSL #30 + ORR r3, r12, lr, LSL #30 + /* s[1] <<< 55 */ + MOV lr, r4 + LSR r12, r5, #9 + LSR r4, r4, #9 + ORR r4, r4, r5, LSL #23 + ORR r5, r12, lr, LSL #23 + /* s[2] <<< 39 */ + MOV lr, r6 + LSR r12, r7, #25 + LSR r6, r6, #25 + ORR r6, r6, r7, LSL #7 + ORR r7, r12, lr, LSL #7 + /* s[3] <<< 41 */ + MOV lr, r8 + LSR r12, r9, #23 + LSR r8, r8, #23 + ORR r8, r8, r9, LSL #9 + ORR r9, r12, lr, LSL #9 + /* s[4] <<< 2 */ + LSR r12, r11, #30 + LSR lr, r10, #30 + ORR r10, r12, r10, LSL #2 + ORR r11, lr, r11, LSL #2 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [sp, #168] + STR lr, [sp, #172] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [sp, #176] + STR lr, [sp, #180] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [sp, #184] + STR lr, [sp, #188] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [sp, #192] + STR lr, [sp, #196] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [sp, #160] + STR lr, [sp, #164] + /* Round odd */ + /* Calc b[4] */ + LDRD r4, r5, [sp, #32] + LDRD r6, r7, [sp, #72] + LDRD r8, r9, [sp, #112] + LDRD r10, r11, [sp, #152] + LDR r12, [sp, #192] + LDR lr, [sp, #196] + EOR r2, r4, r6 + EOR r3, r5, r7 + EOR r2, r2, r8 + EOR r3, r3, r9 + EOR r2, r2, r10 + EOR r3, r3, r11 + EOR r2, r2, r12 + EOR r3, r3, lr + STRD r2, r3, [r0, #32] + /* Calc b[1] */ + LDRD r4, r5, [sp, #8] + LDRD r6, r7, [sp, #48] + LDRD r8, r9, [sp, #88] + LDRD r10, r11, [sp, #128] + LDR r12, [sp, #168] + LDR lr, [sp, #172] + EOR r4, r4, r6 + EOR r5, r5, r7 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r4, r4, r10 + EOR r5, r5, r11 + EOR r4, r4, r12 + EOR r5, r5, lr + STRD r4, r5, [r0, #8] + /* Calc t[0] */ + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* Calc b[0] and XOR t[0] into s[x*5+0] */ + LDRD r4, r5, [sp] + LDRD r6, r7, [sp, #40] + LDRD r8, r9, [sp, #80] + LDRD r10, r11, [sp, #120] + EOR r12, r4, r6 + EOR lr, r5, r7 + EOR r12, r12, r8 + EOR lr, lr, r9 + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r4, r5, [sp] + STRD r6, r7, [sp, #40] + STRD r8, r9, [sp, #80] + STRD r10, r11, [sp, #120] + LDRD r10, r11, [sp, #160] + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r10, r11, [sp, #160] + STR r12, [r0] + STR lr, [r0, #4] + /* Calc b[3] */ + LDRD r4, r5, [sp, #24] + 
LDRD r6, r7, [sp, #64] + LDRD r8, r9, [sp, #104] + LDRD r10, r11, [sp, #144] + LDR r12, [sp, #184] + LDR lr, [sp, #188] + EOR r4, r4, r6 + EOR r5, r5, r7 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r4, r4, r10 + EOR r5, r5, r11 + EOR r4, r4, r12 + EOR r5, r5, lr + STRD r4, r5, [r0, #24] + /* Calc t[2] */ + LDRD r2, r3, [r0, #8] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* Calc b[2] and XOR t[2] into s[x*5+2] */ + LDRD r4, r5, [sp, #16] + LDRD r6, r7, [sp, #56] + LDRD r8, r9, [sp, #96] + LDRD r10, r11, [sp, #136] + EOR r12, r4, r6 + EOR lr, r5, r7 + EOR r12, r12, r8 + EOR lr, lr, r9 + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r4, r5, [sp, #16] + STRD r6, r7, [sp, #56] + STRD r8, r9, [sp, #96] + STRD r10, r11, [sp, #136] + LDRD r10, r11, [sp, #176] + EOR r12, r12, r10 + EOR lr, lr, r11 + EOR r10, r10, r2 + EOR r11, r11, r3 + STRD r10, r11, [sp, #176] + STR r12, [r0, #16] + STR lr, [r0, #20] + /* Calc t[1] */ + LDRD r2, r3, [r0] + EOR r2, r2, lr, LSR #31 + EOR r3, r3, r12, LSR #31 + EOR r2, r2, r12, LSL #1 + EOR r3, r3, lr, LSL #1 + /* XOR t[1] into s[x*5+1] */ + LDRD r4, r5, [sp, #8] + LDRD r6, r7, [sp, #48] + LDRD r8, r9, [sp, #88] + LDRD r10, r11, [sp, #128] + LDR r12, [sp, #168] + LDR lr, [sp, #172] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [sp, #8] + STRD r6, r7, [sp, #48] + STRD r8, r9, [sp, #88] + STRD r10, r11, [sp, #128] + STR r12, [sp, #168] + STR lr, [sp, #172] + /* Calc t[3] */ + LDRD r2, r3, [r0, #16] + LDRD r4, r5, [r0, #32] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* XOR t[3] into s[x*5+3] */ + LDRD r4, r5, [sp, #24] + LDRD r6, r7, [sp, #64] + LDRD r8, r9, [sp, #104] + LDRD r10, r11, [sp, #144] + LDR r12, [sp, #184] + LDR lr, [sp, #188] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [sp, #24] + STRD r6, r7, [sp, #64] + STRD r8, r9, [sp, #104] + STRD r10, r11, [sp, #144] + STR r12, [sp, #184] + STR lr, [sp, #188] + /* Calc t[4] */ + LDRD r2, r3, [r0, #24] + LDRD r4, r5, [r0] + EOR r2, r2, r5, LSR #31 + EOR r3, r3, r4, LSR #31 + EOR r2, r2, r4, LSL #1 + EOR r3, r3, r5, LSL #1 + /* XOR t[4] into s[x*5+4] */ + LDRD r4, r5, [sp, #32] + LDRD r6, r7, [sp, #72] + LDRD r8, r9, [sp, #112] + LDRD r10, r11, [sp, #152] + LDR r12, [sp, #192] + LDR lr, [sp, #196] + EOR r4, r4, r2 + EOR r5, r5, r3 + EOR r6, r6, r2 + EOR r7, r7, r3 + EOR r8, r8, r2 + EOR r9, r9, r3 + EOR r10, r10, r2 + EOR r11, r11, r3 + EOR r12, r12, r2 + EOR lr, lr, r3 + STRD r4, r5, [sp, #32] + STRD r6, r7, [sp, #72] + STRD r8, r9, [sp, #112] + STRD r10, r11, [sp, #152] + STR r12, [sp, #192] + STR lr, [sp, #196] + /* Row Mix */ + /* Row 0 */ + LDRD r2, r3, [sp] + LDRD r4, r5, [sp, #48] + LDRD r6, r7, [sp, #96] + LDRD r8, r9, [sp, #144] + LDRD r10, r11, [sp, #192] + /* s[1] <<< 44 */ + MOV lr, r4 + LSR r12, r5, #20 + LSR r4, r4, #20 + ORR r4, r4, r5, LSL #12 + ORR r5, r12, lr, LSL #12 + /* s[2] <<< 43 */ + MOV lr, r6 + LSR r12, r7, #21 + LSR r6, r6, #21 + ORR r6, r6, r7, LSL #11 + ORR r7, r12, lr, LSL #11 + /* s[3] <<< 21 */ + LSR r12, r9, #11 + LSR lr, r8, #11 + ORR r8, r12, r8, 
LSL #21 + ORR r9, lr, r9, LSL #21 + /* s[4] <<< 14 */ + LSR r12, r11, #18 + LSR lr, r10, #18 + ORR r10, r12, r10, LSL #14 + ORR r11, lr, r11, LSL #14 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [r0, #8] + STR lr, [r0, #12] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [r0, #16] + STR lr, [r0, #20] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [r0, #24] + STR lr, [r0, #28] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0, #32] + STR lr, [r0, #36] + /* Get constant */ + LDRD r10, r11, [r1] + ADD r1, r1, #0x8 + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + /* XOR in constant */ + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0] + STR lr, [r0, #4] + /* Row 1 */ + LDRD r2, r3, [sp, #24] + LDRD r4, r5, [sp, #72] + LDRD r6, r7, [sp, #80] + LDRD r8, r9, [sp, #128] + LDRD r10, r11, [sp, #176] + /* s[0] <<< 28 */ + LSR r12, r3, #4 + LSR lr, r2, #4 + ORR r2, r12, r2, LSL #28 + ORR r3, lr, r3, LSL #28 + /* s[1] <<< 20 */ + LSR r12, r5, #12 + LSR lr, r4, #12 + ORR r4, r12, r4, LSL #20 + ORR r5, lr, r5, LSL #20 + /* s[2] <<< 3 */ + LSR r12, r7, #29 + LSR lr, r6, #29 + ORR r6, r12, r6, LSL #3 + ORR r7, lr, r7, LSL #3 + /* s[3] <<< 45 */ + MOV lr, r8 + LSR r12, r9, #19 + LSR r8, r8, #19 + ORR r8, r8, r9, LSL #13 + ORR r9, r12, lr, LSL #13 + /* s[4] <<< 61 */ + MOV lr, r10 + LSR r12, r11, #3 + LSR r10, r10, #3 + ORR r10, r10, r11, LSL #29 + ORR r11, r12, lr, LSL #29 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [r0, #48] + STR lr, [r0, #52] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [r0, #56] + STR lr, [r0, #60] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [r0, #64] + STR lr, [r0, #68] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0, #72] + STR lr, [r0, #76] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [r0, #40] + STR lr, [r0, #44] + /* Row 2 */ + LDRD r2, r3, [sp, #8] + LDRD r4, r5, [sp, #56] + LDRD r6, r7, [sp, #104] + LDRD r8, r9, [sp, #152] + LDRD r10, r11, [sp, #160] + /* s[0] <<< 1 */ + LSR r12, r3, #31 + LSR lr, r2, #31 + ORR r2, r12, r2, LSL #1 + ORR r3, lr, r3, LSL #1 + /* s[1] <<< 6 */ + LSR r12, r5, #26 + LSR lr, r4, #26 + ORR r4, r12, r4, LSL #6 + ORR r5, lr, r5, LSL #6 + /* s[2] <<< 25 */ + LSR r12, r7, #7 + LSR lr, r6, #7 + ORR r6, r12, r6, LSL #25 + ORR r7, lr, r7, LSL #25 + /* s[3] <<< 8 */ + LSR r12, r9, #24 + LSR lr, r8, #24 + ORR r8, r12, r8, LSL #8 + ORR r9, lr, r9, LSL #8 + /* s[4] <<< 18 */ + LSR r12, r11, #14 + LSR lr, r10, #14 + ORR r10, r12, r10, LSL #18 + ORR r11, lr, r11, LSL #18 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [r0, #88] + STR lr, [r0, #92] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [r0, #96] + STR lr, [r0, #100] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [r0, #104] + STR lr, [r0, #108] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0, #112] + STR lr, [r0, #116] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [r0, #80] + STR lr, [r0, #84] + /* Row 3 */ + LDRD r2, r3, [sp, #32] + LDRD r4, r5, [sp, #40] + LDRD r6, r7, [sp, #88] + LDRD r8, r9, [sp, #136] + LDRD r10, r11, [sp, #184] + /* s[0] <<< 27 */ + LSR r12, 
r3, #5 + LSR lr, r2, #5 + ORR r2, r12, r2, LSL #27 + ORR r3, lr, r3, LSL #27 + /* s[1] <<< 36 */ + MOV lr, r4 + LSR r12, r5, #28 + LSR r4, r4, #28 + ORR r4, r4, r5, LSL #4 + ORR r5, r12, lr, LSL #4 + /* s[2] <<< 10 */ + LSR r12, r7, #22 + LSR lr, r6, #22 + ORR r6, r12, r6, LSL #10 + ORR r7, lr, r7, LSL #10 + /* s[3] <<< 15 */ + LSR r12, r9, #17 + LSR lr, r8, #17 + ORR r8, r12, r8, LSL #15 + ORR r9, lr, r9, LSL #15 + /* s[4] <<< 56 */ + MOV lr, r10 + LSR r12, r11, #8 + LSR r10, r10, #8 + ORR r10, r10, r11, LSL #24 + ORR r11, r12, lr, LSL #24 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [r0, #128] + STR lr, [r0, #132] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [r0, #136] + STR lr, [r0, #140] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [r0, #144] + STR lr, [r0, #148] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0, #152] + STR lr, [r0, #156] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [r0, #120] + STR lr, [r0, #124] + /* Row 4 */ + LDRD r2, r3, [sp, #16] + LDRD r4, r5, [sp, #64] + LDRD r6, r7, [sp, #112] + LDRD r8, r9, [sp, #120] + LDRD r10, r11, [sp, #168] + /* s[0] <<< 62 */ + MOV lr, r2 + LSR r12, r3, #2 + LSR r2, r2, #2 + ORR r2, r2, r3, LSL #30 + ORR r3, r12, lr, LSL #30 + /* s[1] <<< 55 */ + MOV lr, r4 + LSR r12, r5, #9 + LSR r4, r4, #9 + ORR r4, r4, r5, LSL #23 + ORR r5, r12, lr, LSL #23 + /* s[2] <<< 39 */ + MOV lr, r6 + LSR r12, r7, #25 + LSR r6, r6, #25 + ORR r6, r6, r7, LSL #7 + ORR r7, r12, lr, LSL #7 + /* s[3] <<< 41 */ + MOV lr, r8 + LSR r12, r9, #23 + LSR r8, r8, #23 + ORR r8, r8, r9, LSL #9 + ORR r9, r12, lr, LSL #9 + /* s[4] <<< 2 */ + LSR r12, r11, #30 + LSR lr, r10, #30 + ORR r10, r12, r10, LSL #2 + ORR r11, lr, r11, LSL #2 + BIC r12, r8, r6 + BIC lr, r9, r7 + EOR r12, r12, r4 + EOR lr, lr, r5 + STR r12, [r0, #168] + STR lr, [r0, #172] + BIC r12, r10, r8 + BIC lr, r11, r9 + EOR r12, r12, r6 + EOR lr, lr, r7 + STR r12, [r0, #176] + STR lr, [r0, #180] + BIC r12, r2, r10 + BIC lr, r3, r11 + EOR r12, r12, r8 + EOR lr, lr, r9 + STR r12, [r0, #184] + STR lr, [r0, #188] + BIC r12, r4, r2 + BIC lr, r5, r3 + EOR r12, r12, r10 + EOR lr, lr, r11 + STR r12, [r0, #192] + STR lr, [r0, #196] + BIC r12, r6, r4 + BIC lr, r7, r5 + EOR r12, r12, r2 + EOR lr, lr, r3 + STR r12, [r0, #160] + STR lr, [r0, #164] + LDR r2, [sp, #200] + SUBS r2, r2, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_sha3_thumb2_begin +#else + BNE.W L_sha3_thumb2_begin +#endif + ADD sp, sp, #0xcc + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 1505 */ + .size BlockSha3,.-BlockSha3 +#endif /* WOLFSSL_SHA3 */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c new file mode 100644 index 000000000..03b564fe7 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha3-asm_c.c @@ -0,0 +1,1168 @@ +/* thumb2-sha3-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. 
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha3/sha3.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha3-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#include + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#ifdef WOLFSSL_SHA3 +static const word64 L_sha3_thumb2_rt[] = { + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL, +}; + +#include + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void BlockSha3(word64* state_p) +#else +void BlockSha3(word64* state) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register word64* state __asm__ ("r0") = (word64*)state_p; + register word64* L_sha3_thumb2_rt_c __asm__ ("r1") = (word64*)&L_sha3_thumb2_rt; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0xcc\n\t" + "MOV r1, %[L_sha3_thumb2_rt]\n\t" + "MOV r2, #0xc\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_sha3_thumb2_begin:\n\t" +#else + "L_sha3_thumb2_begin_%=:\n\t" +#endif + "STR r2, [sp, #200]\n\t" + /* Round even */ + /* Calc b[4] */ + "LDRD r4, r5, [%[state], #32]\n\t" + "LDRD r6, r7, [%[state], #72]\n\t" + "LDRD r8, r9, [%[state], #112]\n\t" + "LDRD r10, r11, [%[state], #152]\n\t" + "LDR r12, [%[state], #192]\n\t" + "LDR lr, [%[state], #196]\n\t" + "EOR r2, r4, r6\n\t" + "EOR r3, r5, r7\n\t" + "EOR r2, r2, r8\n\t" + "EOR r3, r3, r9\n\t" + "EOR r2, r2, r10\n\t" + "EOR r3, r3, r11\n\t" + "EOR r2, r2, r12\n\t" + "EOR r3, r3, lr\n\t" + "STRD r2, r3, [sp, #32]\n\t" + /* Calc b[1] */ + "LDRD r4, r5, [%[state], #8]\n\t" + "LDRD r6, r7, [%[state], #48]\n\t" + "LDRD r8, r9, [%[state], #88]\n\t" + "LDRD r10, r11, [%[state], #128]\n\t" + "LDR r12, [%[state], #168]\n\t" + "LDR lr, [%[state], #172]\n\t" + "EOR r4, r4, r6\n\t" + "EOR r5, r5, r7\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "EOR r4, r4, r12\n\t" + "EOR r5, r5, lr\n\t" + "STRD r4, r5, [sp, #8]\n\t" + /* Calc t[0] */ + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, 
r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* Calc b[0] and XOR t[0] into s[x*5+0] */ + "LDRD r4, r5, [%[state]]\n\t" + "LDRD r6, r7, [%[state], #40]\n\t" + "LDRD r8, r9, [%[state], #80]\n\t" + "LDRD r10, r11, [%[state], #120]\n\t" + "EOR r12, r4, r6\n\t" + "EOR lr, r5, r7\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r4, r5, [%[state]]\n\t" + "STRD r6, r7, [%[state], #40]\n\t" + "STRD r8, r9, [%[state], #80]\n\t" + "STRD r10, r11, [%[state], #120]\n\t" + "LDRD r10, r11, [%[state], #160]\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r10, r11, [%[state], #160]\n\t" + "STR r12, [sp]\n\t" + "STR lr, [sp, #4]\n\t" + /* Calc b[3] */ + "LDRD r4, r5, [%[state], #24]\n\t" + "LDRD r6, r7, [%[state], #64]\n\t" + "LDRD r8, r9, [%[state], #104]\n\t" + "LDRD r10, r11, [%[state], #144]\n\t" + "LDR r12, [%[state], #184]\n\t" + "LDR lr, [%[state], #188]\n\t" + "EOR r4, r4, r6\n\t" + "EOR r5, r5, r7\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "EOR r4, r4, r12\n\t" + "EOR r5, r5, lr\n\t" + "STRD r4, r5, [sp, #24]\n\t" + /* Calc t[2] */ + "LDRD r2, r3, [sp, #8]\n\t" + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* Calc b[2] and XOR t[2] into s[x*5+2] */ + "LDRD r4, r5, [%[state], #16]\n\t" + "LDRD r6, r7, [%[state], #56]\n\t" + "LDRD r8, r9, [%[state], #96]\n\t" + "LDRD r10, r11, [%[state], #136]\n\t" + "EOR r12, r4, r6\n\t" + "EOR lr, r5, r7\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r4, r5, [%[state], #16]\n\t" + "STRD r6, r7, [%[state], #56]\n\t" + "STRD r8, r9, [%[state], #96]\n\t" + "STRD r10, r11, [%[state], #136]\n\t" + "LDRD r10, r11, [%[state], #176]\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r10, r11, [%[state], #176]\n\t" + "STR r12, [sp, #16]\n\t" + "STR lr, [sp, #20]\n\t" + /* Calc t[1] */ + "LDRD r2, r3, [sp]\n\t" + "EOR r2, r2, lr, LSR #31\n\t" + "EOR r3, r3, r12, LSR #31\n\t" + "EOR r2, r2, r12, LSL #1\n\t" + "EOR r3, r3, lr, LSL #1\n\t" + /* XOR t[1] into s[x*5+1] */ + "LDRD r4, r5, [%[state], #8]\n\t" + "LDRD r6, r7, [%[state], #48]\n\t" + "LDRD r8, r9, [%[state], #88]\n\t" + "LDRD r10, r11, [%[state], #128]\n\t" + "LDR r12, [%[state], #168]\n\t" + "LDR lr, [%[state], #172]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [%[state], #8]\n\t" + "STRD r6, r7, [%[state], #48]\n\t" + "STRD r8, r9, [%[state], #88]\n\t" + "STRD r10, r11, [%[state], #128]\n\t" + "STR r12, [%[state], #168]\n\t" + "STR lr, [%[state], #172]\n\t" + /* Calc t[3] */ + "LDRD r2, r3, [sp, #16]\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, 
r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* XOR t[3] into s[x*5+3] */ + "LDRD r4, r5, [%[state], #24]\n\t" + "LDRD r6, r7, [%[state], #64]\n\t" + "LDRD r8, r9, [%[state], #104]\n\t" + "LDRD r10, r11, [%[state], #144]\n\t" + "LDR r12, [%[state], #184]\n\t" + "LDR lr, [%[state], #188]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [%[state], #24]\n\t" + "STRD r6, r7, [%[state], #64]\n\t" + "STRD r8, r9, [%[state], #104]\n\t" + "STRD r10, r11, [%[state], #144]\n\t" + "STR r12, [%[state], #184]\n\t" + "STR lr, [%[state], #188]\n\t" + /* Calc t[4] */ + "LDRD r2, r3, [sp, #24]\n\t" + "LDRD r4, r5, [sp]\n\t" + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* XOR t[4] into s[x*5+4] */ + "LDRD r4, r5, [%[state], #32]\n\t" + "LDRD r6, r7, [%[state], #72]\n\t" + "LDRD r8, r9, [%[state], #112]\n\t" + "LDRD r10, r11, [%[state], #152]\n\t" + "LDR r12, [%[state], #192]\n\t" + "LDR lr, [%[state], #196]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [%[state], #32]\n\t" + "STRD r6, r7, [%[state], #72]\n\t" + "STRD r8, r9, [%[state], #112]\n\t" + "STRD r10, r11, [%[state], #152]\n\t" + "STR r12, [%[state], #192]\n\t" + "STR lr, [%[state], #196]\n\t" + /* Row Mix */ + /* Row 0 */ + "LDRD r2, r3, [%[state]]\n\t" + "LDRD r4, r5, [%[state], #48]\n\t" + "LDRD r6, r7, [%[state], #96]\n\t" + "LDRD r8, r9, [%[state], #144]\n\t" + "LDRD r10, r11, [%[state], #192]\n\t" + /* s[1] <<< 44 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #20\n\t" + "LSR r4, r4, #20\n\t" + "ORR r4, r4, r5, LSL #12\n\t" + "ORR r5, r12, lr, LSL #12\n\t" + /* s[2] <<< 43 */ + "MOV lr, r6\n\t" + "LSR r12, r7, #21\n\t" + "LSR r6, r6, #21\n\t" + "ORR r6, r6, r7, LSL #11\n\t" + "ORR r7, r12, lr, LSL #11\n\t" + /* s[3] <<< 21 */ + "LSR r12, r9, #11\n\t" + "LSR lr, r8, #11\n\t" + "ORR r8, r12, r8, LSL #21\n\t" + "ORR r9, lr, r9, LSL #21\n\t" + /* s[4] <<< 14 */ + "LSR r12, r11, #18\n\t" + "LSR lr, r10, #18\n\t" + "ORR r10, r12, r10, LSL #14\n\t" + "ORR r11, lr, r11, LSL #14\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [sp, #8]\n\t" + "STR lr, [sp, #12]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [sp, #16]\n\t" + "STR lr, [sp, #20]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [sp, #24]\n\t" + "STR lr, [sp, #28]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp, #32]\n\t" + "STR lr, [sp, #36]\n\t" + /* Get constant */ + "LDRD r10, r11, [r1]\n\t" + "ADD r1, r1, #0x8\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + /* XOR in constant */ + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp]\n\t" + "STR lr, [sp, #4]\n\t" + /* Row 1 */ + "LDRD r2, r3, [%[state], #24]\n\t" + "LDRD r4, r5, [%[state], #72]\n\t" + "LDRD r6, r7, [%[state], #80]\n\t" + "LDRD r8, r9, [%[state], #128]\n\t" + "LDRD r10, r11, [%[state], #176]\n\t" + /* s[0] 
<<< 28 */ + "LSR r12, r3, #4\n\t" + "LSR lr, r2, #4\n\t" + "ORR r2, r12, r2, LSL #28\n\t" + "ORR r3, lr, r3, LSL #28\n\t" + /* s[1] <<< 20 */ + "LSR r12, r5, #12\n\t" + "LSR lr, r4, #12\n\t" + "ORR r4, r12, r4, LSL #20\n\t" + "ORR r5, lr, r5, LSL #20\n\t" + /* s[2] <<< 3 */ + "LSR r12, r7, #29\n\t" + "LSR lr, r6, #29\n\t" + "ORR r6, r12, r6, LSL #3\n\t" + "ORR r7, lr, r7, LSL #3\n\t" + /* s[3] <<< 45 */ + "MOV lr, r8\n\t" + "LSR r12, r9, #19\n\t" + "LSR r8, r8, #19\n\t" + "ORR r8, r8, r9, LSL #13\n\t" + "ORR r9, r12, lr, LSL #13\n\t" + /* s[4] <<< 61 */ + "MOV lr, r10\n\t" + "LSR r12, r11, #3\n\t" + "LSR r10, r10, #3\n\t" + "ORR r10, r10, r11, LSL #29\n\t" + "ORR r11, r12, lr, LSL #29\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [sp, #48]\n\t" + "STR lr, [sp, #52]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [sp, #56]\n\t" + "STR lr, [sp, #60]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [sp, #64]\n\t" + "STR lr, [sp, #68]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp, #72]\n\t" + "STR lr, [sp, #76]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [sp, #40]\n\t" + "STR lr, [sp, #44]\n\t" + /* Row 2 */ + "LDRD r2, r3, [%[state], #8]\n\t" + "LDRD r4, r5, [%[state], #56]\n\t" + "LDRD r6, r7, [%[state], #104]\n\t" + "LDRD r8, r9, [%[state], #152]\n\t" + "LDRD r10, r11, [%[state], #160]\n\t" + /* s[0] <<< 1 */ + "LSR r12, r3, #31\n\t" + "LSR lr, r2, #31\n\t" + "ORR r2, r12, r2, LSL #1\n\t" + "ORR r3, lr, r3, LSL #1\n\t" + /* s[1] <<< 6 */ + "LSR r12, r5, #26\n\t" + "LSR lr, r4, #26\n\t" + "ORR r4, r12, r4, LSL #6\n\t" + "ORR r5, lr, r5, LSL #6\n\t" + /* s[2] <<< 25 */ + "LSR r12, r7, #7\n\t" + "LSR lr, r6, #7\n\t" + "ORR r6, r12, r6, LSL #25\n\t" + "ORR r7, lr, r7, LSL #25\n\t" + /* s[3] <<< 8 */ + "LSR r12, r9, #24\n\t" + "LSR lr, r8, #24\n\t" + "ORR r8, r12, r8, LSL #8\n\t" + "ORR r9, lr, r9, LSL #8\n\t" + /* s[4] <<< 18 */ + "LSR r12, r11, #14\n\t" + "LSR lr, r10, #14\n\t" + "ORR r10, r12, r10, LSL #18\n\t" + "ORR r11, lr, r11, LSL #18\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [sp, #88]\n\t" + "STR lr, [sp, #92]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [sp, #96]\n\t" + "STR lr, [sp, #100]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [sp, #104]\n\t" + "STR lr, [sp, #108]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp, #112]\n\t" + "STR lr, [sp, #116]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [sp, #80]\n\t" + "STR lr, [sp, #84]\n\t" + /* Row 3 */ + "LDRD r2, r3, [%[state], #32]\n\t" + "LDRD r4, r5, [%[state], #40]\n\t" + "LDRD r6, r7, [%[state], #88]\n\t" + "LDRD r8, r9, [%[state], #136]\n\t" + "LDRD r10, r11, [%[state], #184]\n\t" + /* s[0] <<< 27 */ + "LSR r12, r3, #5\n\t" + "LSR lr, r2, #5\n\t" + "ORR r2, r12, r2, LSL #27\n\t" + "ORR r3, lr, r3, LSL #27\n\t" + /* s[1] <<< 36 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #28\n\t" + "LSR r4, r4, #28\n\t" + "ORR r4, r4, r5, LSL #4\n\t" + "ORR r5, r12, lr, LSL #4\n\t" + /* 
s[2] <<< 10 */ + "LSR r12, r7, #22\n\t" + "LSR lr, r6, #22\n\t" + "ORR r6, r12, r6, LSL #10\n\t" + "ORR r7, lr, r7, LSL #10\n\t" + /* s[3] <<< 15 */ + "LSR r12, r9, #17\n\t" + "LSR lr, r8, #17\n\t" + "ORR r8, r12, r8, LSL #15\n\t" + "ORR r9, lr, r9, LSL #15\n\t" + /* s[4] <<< 56 */ + "MOV lr, r10\n\t" + "LSR r12, r11, #8\n\t" + "LSR r10, r10, #8\n\t" + "ORR r10, r10, r11, LSL #24\n\t" + "ORR r11, r12, lr, LSL #24\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [sp, #128]\n\t" + "STR lr, [sp, #132]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [sp, #136]\n\t" + "STR lr, [sp, #140]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [sp, #144]\n\t" + "STR lr, [sp, #148]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp, #152]\n\t" + "STR lr, [sp, #156]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [sp, #120]\n\t" + "STR lr, [sp, #124]\n\t" + /* Row 4 */ + "LDRD r2, r3, [%[state], #16]\n\t" + "LDRD r4, r5, [%[state], #64]\n\t" + "LDRD r6, r7, [%[state], #112]\n\t" + "LDRD r8, r9, [%[state], #120]\n\t" + "LDRD r10, r11, [%[state], #168]\n\t" + /* s[0] <<< 62 */ + "MOV lr, r2\n\t" + "LSR r12, r3, #2\n\t" + "LSR r2, r2, #2\n\t" + "ORR r2, r2, r3, LSL #30\n\t" + "ORR r3, r12, lr, LSL #30\n\t" + /* s[1] <<< 55 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #9\n\t" + "LSR r4, r4, #9\n\t" + "ORR r4, r4, r5, LSL #23\n\t" + "ORR r5, r12, lr, LSL #23\n\t" + /* s[2] <<< 39 */ + "MOV lr, r6\n\t" + "LSR r12, r7, #25\n\t" + "LSR r6, r6, #25\n\t" + "ORR r6, r6, r7, LSL #7\n\t" + "ORR r7, r12, lr, LSL #7\n\t" + /* s[3] <<< 41 */ + "MOV lr, r8\n\t" + "LSR r12, r9, #23\n\t" + "LSR r8, r8, #23\n\t" + "ORR r8, r8, r9, LSL #9\n\t" + "ORR r9, r12, lr, LSL #9\n\t" + /* s[4] <<< 2 */ + "LSR r12, r11, #30\n\t" + "LSR lr, r10, #30\n\t" + "ORR r10, r12, r10, LSL #2\n\t" + "ORR r11, lr, r11, LSL #2\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [sp, #168]\n\t" + "STR lr, [sp, #172]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [sp, #176]\n\t" + "STR lr, [sp, #180]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [sp, #184]\n\t" + "STR lr, [sp, #188]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [sp, #192]\n\t" + "STR lr, [sp, #196]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [sp, #160]\n\t" + "STR lr, [sp, #164]\n\t" + /* Round odd */ + /* Calc b[4] */ + "LDRD r4, r5, [sp, #32]\n\t" + "LDRD r6, r7, [sp, #72]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "LDRD r10, r11, [sp, #152]\n\t" + "LDR r12, [sp, #192]\n\t" + "LDR lr, [sp, #196]\n\t" + "EOR r2, r4, r6\n\t" + "EOR r3, r5, r7\n\t" + "EOR r2, r2, r8\n\t" + "EOR r3, r3, r9\n\t" + "EOR r2, r2, r10\n\t" + "EOR r3, r3, r11\n\t" + "EOR r2, r2, r12\n\t" + "EOR r3, r3, lr\n\t" + "STRD r2, r3, [%[state], #32]\n\t" + /* Calc b[1] */ + "LDRD r4, r5, [sp, #8]\n\t" + "LDRD r6, r7, [sp, #48]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "LDRD r10, r11, [sp, #128]\n\t" + "LDR r12, [sp, #168]\n\t" + "LDR lr, [sp, #172]\n\t" + "EOR r4, r4, r6\n\t" + "EOR r5, 
r5, r7\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "EOR r4, r4, r12\n\t" + "EOR r5, r5, lr\n\t" + "STRD r4, r5, [%[state], #8]\n\t" + /* Calc t[0] */ + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* Calc b[0] and XOR t[0] into s[x*5+0] */ + "LDRD r4, r5, [sp]\n\t" + "LDRD r6, r7, [sp, #40]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "LDRD r10, r11, [sp, #120]\n\t" + "EOR r12, r4, r6\n\t" + "EOR lr, r5, r7\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r4, r5, [sp]\n\t" + "STRD r6, r7, [sp, #40]\n\t" + "STRD r8, r9, [sp, #80]\n\t" + "STRD r10, r11, [sp, #120]\n\t" + "LDRD r10, r11, [sp, #160]\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r10, r11, [sp, #160]\n\t" + "STR r12, [%[state]]\n\t" + "STR lr, [%[state], #4]\n\t" + /* Calc b[3] */ + "LDRD r4, r5, [sp, #24]\n\t" + "LDRD r6, r7, [sp, #64]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "LDRD r10, r11, [sp, #144]\n\t" + "LDR r12, [sp, #184]\n\t" + "LDR lr, [sp, #188]\n\t" + "EOR r4, r4, r6\n\t" + "EOR r5, r5, r7\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "EOR r4, r4, r12\n\t" + "EOR r5, r5, lr\n\t" + "STRD r4, r5, [%[state], #24]\n\t" + /* Calc t[2] */ + "LDRD r2, r3, [%[state], #8]\n\t" + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* Calc b[2] and XOR t[2] into s[x*5+2] */ + "LDRD r4, r5, [sp, #16]\n\t" + "LDRD r6, r7, [sp, #56]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "LDRD r10, r11, [sp, #136]\n\t" + "EOR r12, r4, r6\n\t" + "EOR lr, r5, r7\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r4, r5, [sp, #16]\n\t" + "STRD r6, r7, [sp, #56]\n\t" + "STRD r8, r9, [sp, #96]\n\t" + "STRD r10, r11, [sp, #136]\n\t" + "LDRD r10, r11, [sp, #176]\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "STRD r10, r11, [sp, #176]\n\t" + "STR r12, [%[state], #16]\n\t" + "STR lr, [%[state], #20]\n\t" + /* Calc t[1] */ + "LDRD r2, r3, [%[state]]\n\t" + "EOR r2, r2, lr, LSR #31\n\t" + "EOR r3, r3, r12, LSR #31\n\t" + "EOR r2, r2, r12, LSL #1\n\t" + "EOR r3, r3, lr, LSL #1\n\t" + /* XOR t[1] into s[x*5+1] */ + "LDRD r4, r5, [sp, #8]\n\t" + "LDRD r6, r7, [sp, #48]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "LDRD r10, r11, [sp, #128]\n\t" + "LDR r12, [sp, #168]\n\t" + "LDR lr, [sp, #172]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [sp, #8]\n\t" + "STRD r6, r7, [sp, #48]\n\t" + "STRD r8, r9, [sp, #88]\n\t" + "STRD r10, r11, [sp, #128]\n\t" + "STR r12, [sp, #168]\n\t" + "STR lr, [sp, #172]\n\t" + /* Calc t[3] */ + "LDRD r2, r3, [%[state], #16]\n\t" + "LDRD r4, r5, [%[state], #32]\n\t" + 
"EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* XOR t[3] into s[x*5+3] */ + "LDRD r4, r5, [sp, #24]\n\t" + "LDRD r6, r7, [sp, #64]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "LDRD r10, r11, [sp, #144]\n\t" + "LDR r12, [sp, #184]\n\t" + "LDR lr, [sp, #188]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [sp, #24]\n\t" + "STRD r6, r7, [sp, #64]\n\t" + "STRD r8, r9, [sp, #104]\n\t" + "STRD r10, r11, [sp, #144]\n\t" + "STR r12, [sp, #184]\n\t" + "STR lr, [sp, #188]\n\t" + /* Calc t[4] */ + "LDRD r2, r3, [%[state], #24]\n\t" + "LDRD r4, r5, [%[state]]\n\t" + "EOR r2, r2, r5, LSR #31\n\t" + "EOR r3, r3, r4, LSR #31\n\t" + "EOR r2, r2, r4, LSL #1\n\t" + "EOR r3, r3, r5, LSL #1\n\t" + /* XOR t[4] into s[x*5+4] */ + "LDRD r4, r5, [sp, #32]\n\t" + "LDRD r6, r7, [sp, #72]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "LDRD r10, r11, [sp, #152]\n\t" + "LDR r12, [sp, #192]\n\t" + "LDR lr, [sp, #196]\n\t" + "EOR r4, r4, r2\n\t" + "EOR r5, r5, r3\n\t" + "EOR r6, r6, r2\n\t" + "EOR r7, r7, r3\n\t" + "EOR r8, r8, r2\n\t" + "EOR r9, r9, r3\n\t" + "EOR r10, r10, r2\n\t" + "EOR r11, r11, r3\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STRD r4, r5, [sp, #32]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "STRD r8, r9, [sp, #112]\n\t" + "STRD r10, r11, [sp, #152]\n\t" + "STR r12, [sp, #192]\n\t" + "STR lr, [sp, #196]\n\t" + /* Row Mix */ + /* Row 0 */ + "LDRD r2, r3, [sp]\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "LDRD r6, r7, [sp, #96]\n\t" + "LDRD r8, r9, [sp, #144]\n\t" + "LDRD r10, r11, [sp, #192]\n\t" + /* s[1] <<< 44 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #20\n\t" + "LSR r4, r4, #20\n\t" + "ORR r4, r4, r5, LSL #12\n\t" + "ORR r5, r12, lr, LSL #12\n\t" + /* s[2] <<< 43 */ + "MOV lr, r6\n\t" + "LSR r12, r7, #21\n\t" + "LSR r6, r6, #21\n\t" + "ORR r6, r6, r7, LSL #11\n\t" + "ORR r7, r12, lr, LSL #11\n\t" + /* s[3] <<< 21 */ + "LSR r12, r9, #11\n\t" + "LSR lr, r8, #11\n\t" + "ORR r8, r12, r8, LSL #21\n\t" + "ORR r9, lr, r9, LSL #21\n\t" + /* s[4] <<< 14 */ + "LSR r12, r11, #18\n\t" + "LSR lr, r10, #18\n\t" + "ORR r10, r12, r10, LSL #14\n\t" + "ORR r11, lr, r11, LSL #14\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [%[state], #8]\n\t" + "STR lr, [%[state], #12]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [%[state], #16]\n\t" + "STR lr, [%[state], #20]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [%[state], #24]\n\t" + "STR lr, [%[state], #28]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state], #32]\n\t" + "STR lr, [%[state], #36]\n\t" + /* Get constant */ + "LDRD r10, r11, [r1]\n\t" + "ADD r1, r1, #0x8\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + /* XOR in constant */ + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state]]\n\t" + "STR lr, [%[state], #4]\n\t" + /* Row 1 */ + "LDRD r2, r3, [sp, #24]\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "LDRD r6, r7, [sp, #80]\n\t" + "LDRD r8, r9, [sp, #128]\n\t" + "LDRD r10, r11, [sp, #176]\n\t" + /* s[0] <<< 28 */ + "LSR r12, r3, #4\n\t" + "LSR lr, r2, #4\n\t" 
+ "ORR r2, r12, r2, LSL #28\n\t" + "ORR r3, lr, r3, LSL #28\n\t" + /* s[1] <<< 20 */ + "LSR r12, r5, #12\n\t" + "LSR lr, r4, #12\n\t" + "ORR r4, r12, r4, LSL #20\n\t" + "ORR r5, lr, r5, LSL #20\n\t" + /* s[2] <<< 3 */ + "LSR r12, r7, #29\n\t" + "LSR lr, r6, #29\n\t" + "ORR r6, r12, r6, LSL #3\n\t" + "ORR r7, lr, r7, LSL #3\n\t" + /* s[3] <<< 45 */ + "MOV lr, r8\n\t" + "LSR r12, r9, #19\n\t" + "LSR r8, r8, #19\n\t" + "ORR r8, r8, r9, LSL #13\n\t" + "ORR r9, r12, lr, LSL #13\n\t" + /* s[4] <<< 61 */ + "MOV lr, r10\n\t" + "LSR r12, r11, #3\n\t" + "LSR r10, r10, #3\n\t" + "ORR r10, r10, r11, LSL #29\n\t" + "ORR r11, r12, lr, LSL #29\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [%[state], #48]\n\t" + "STR lr, [%[state], #52]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [%[state], #56]\n\t" + "STR lr, [%[state], #60]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [%[state], #64]\n\t" + "STR lr, [%[state], #68]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state], #72]\n\t" + "STR lr, [%[state], #76]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [%[state], #40]\n\t" + "STR lr, [%[state], #44]\n\t" + /* Row 2 */ + "LDRD r2, r3, [sp, #8]\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "LDRD r6, r7, [sp, #104]\n\t" + "LDRD r8, r9, [sp, #152]\n\t" + "LDRD r10, r11, [sp, #160]\n\t" + /* s[0] <<< 1 */ + "LSR r12, r3, #31\n\t" + "LSR lr, r2, #31\n\t" + "ORR r2, r12, r2, LSL #1\n\t" + "ORR r3, lr, r3, LSL #1\n\t" + /* s[1] <<< 6 */ + "LSR r12, r5, #26\n\t" + "LSR lr, r4, #26\n\t" + "ORR r4, r12, r4, LSL #6\n\t" + "ORR r5, lr, r5, LSL #6\n\t" + /* s[2] <<< 25 */ + "LSR r12, r7, #7\n\t" + "LSR lr, r6, #7\n\t" + "ORR r6, r12, r6, LSL #25\n\t" + "ORR r7, lr, r7, LSL #25\n\t" + /* s[3] <<< 8 */ + "LSR r12, r9, #24\n\t" + "LSR lr, r8, #24\n\t" + "ORR r8, r12, r8, LSL #8\n\t" + "ORR r9, lr, r9, LSL #8\n\t" + /* s[4] <<< 18 */ + "LSR r12, r11, #14\n\t" + "LSR lr, r10, #14\n\t" + "ORR r10, r12, r10, LSL #18\n\t" + "ORR r11, lr, r11, LSL #18\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [%[state], #88]\n\t" + "STR lr, [%[state], #92]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [%[state], #96]\n\t" + "STR lr, [%[state], #100]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [%[state], #104]\n\t" + "STR lr, [%[state], #108]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state], #112]\n\t" + "STR lr, [%[state], #116]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [%[state], #80]\n\t" + "STR lr, [%[state], #84]\n\t" + /* Row 3 */ + "LDRD r2, r3, [sp, #32]\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "LDRD r6, r7, [sp, #88]\n\t" + "LDRD r8, r9, [sp, #136]\n\t" + "LDRD r10, r11, [sp, #184]\n\t" + /* s[0] <<< 27 */ + "LSR r12, r3, #5\n\t" + "LSR lr, r2, #5\n\t" + "ORR r2, r12, r2, LSL #27\n\t" + "ORR r3, lr, r3, LSL #27\n\t" + /* s[1] <<< 36 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #28\n\t" + "LSR r4, r4, #28\n\t" + "ORR r4, r4, r5, LSL #4\n\t" + "ORR r5, r12, lr, LSL #4\n\t" + 
/* s[2] <<< 10 */ + "LSR r12, r7, #22\n\t" + "LSR lr, r6, #22\n\t" + "ORR r6, r12, r6, LSL #10\n\t" + "ORR r7, lr, r7, LSL #10\n\t" + /* s[3] <<< 15 */ + "LSR r12, r9, #17\n\t" + "LSR lr, r8, #17\n\t" + "ORR r8, r12, r8, LSL #15\n\t" + "ORR r9, lr, r9, LSL #15\n\t" + /* s[4] <<< 56 */ + "MOV lr, r10\n\t" + "LSR r12, r11, #8\n\t" + "LSR r10, r10, #8\n\t" + "ORR r10, r10, r11, LSL #24\n\t" + "ORR r11, r12, lr, LSL #24\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [%[state], #128]\n\t" + "STR lr, [%[state], #132]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [%[state], #136]\n\t" + "STR lr, [%[state], #140]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [%[state], #144]\n\t" + "STR lr, [%[state], #148]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state], #152]\n\t" + "STR lr, [%[state], #156]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [%[state], #120]\n\t" + "STR lr, [%[state], #124]\n\t" + /* Row 4 */ + "LDRD r2, r3, [sp, #16]\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "LDRD r6, r7, [sp, #112]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "LDRD r10, r11, [sp, #168]\n\t" + /* s[0] <<< 62 */ + "MOV lr, r2\n\t" + "LSR r12, r3, #2\n\t" + "LSR r2, r2, #2\n\t" + "ORR r2, r2, r3, LSL #30\n\t" + "ORR r3, r12, lr, LSL #30\n\t" + /* s[1] <<< 55 */ + "MOV lr, r4\n\t" + "LSR r12, r5, #9\n\t" + "LSR r4, r4, #9\n\t" + "ORR r4, r4, r5, LSL #23\n\t" + "ORR r5, r12, lr, LSL #23\n\t" + /* s[2] <<< 39 */ + "MOV lr, r6\n\t" + "LSR r12, r7, #25\n\t" + "LSR r6, r6, #25\n\t" + "ORR r6, r6, r7, LSL #7\n\t" + "ORR r7, r12, lr, LSL #7\n\t" + /* s[3] <<< 41 */ + "MOV lr, r8\n\t" + "LSR r12, r9, #23\n\t" + "LSR r8, r8, #23\n\t" + "ORR r8, r8, r9, LSL #9\n\t" + "ORR r9, r12, lr, LSL #9\n\t" + /* s[4] <<< 2 */ + "LSR r12, r11, #30\n\t" + "LSR lr, r10, #30\n\t" + "ORR r10, r12, r10, LSL #2\n\t" + "ORR r11, lr, r11, LSL #2\n\t" + "BIC r12, r8, r6\n\t" + "BIC lr, r9, r7\n\t" + "EOR r12, r12, r4\n\t" + "EOR lr, lr, r5\n\t" + "STR r12, [%[state], #168]\n\t" + "STR lr, [%[state], #172]\n\t" + "BIC r12, r10, r8\n\t" + "BIC lr, r11, r9\n\t" + "EOR r12, r12, r6\n\t" + "EOR lr, lr, r7\n\t" + "STR r12, [%[state], #176]\n\t" + "STR lr, [%[state], #180]\n\t" + "BIC r12, r2, r10\n\t" + "BIC lr, r3, r11\n\t" + "EOR r12, r12, r8\n\t" + "EOR lr, lr, r9\n\t" + "STR r12, [%[state], #184]\n\t" + "STR lr, [%[state], #188]\n\t" + "BIC r12, r4, r2\n\t" + "BIC lr, r5, r3\n\t" + "EOR r12, r12, r10\n\t" + "EOR lr, lr, r11\n\t" + "STR r12, [%[state], #192]\n\t" + "STR lr, [%[state], #196]\n\t" + "BIC r12, r6, r4\n\t" + "BIC lr, r7, r5\n\t" + "EOR r12, r12, r2\n\t" + "EOR lr, lr, r3\n\t" + "STR r12, [%[state], #160]\n\t" + "STR lr, [%[state], #164]\n\t" + "LDR r2, [sp, #200]\n\t" + "SUBS r2, r2, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_sha3_thumb2_begin_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_sha3_thumb2_begin\n\t" +#else + "BNE.W L_sha3_thumb2_begin_%=\n\t" +#endif + "ADD sp, sp, #0xcc\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [state] "+r" (state), + [L_sha3_thumb2_rt] "+r" (L_sha3_thumb2_rt_c) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#else + : [state] "+r" (state) + : [L_sha3_thumb2_rt] "r" (L_sha3_thumb2_rt) + : "memory", 
"r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#endif /* WOLFSSL_SHA3 */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S new file mode 100644 index 000000000..b3c355411 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -0,0 +1,3677 @@ +/* thumb2-sha512-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha512-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#ifdef WOLFSSL_ARMASM_THUMB2 +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef WOLFSSL_SHA512 +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_len_k, %object + .size L_SHA512_transform_len_k, 640 + .align 8 +L_SHA512_transform_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + .word 
0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + .word 0x28db77f5 + .word 0x40c72493 + .word 0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 4 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc0 + ADR r3, L_SHA512_transform_len_k + /* Copy digest to add in at end */ + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + LDRD r10, r11, [r0, #24] + STRD r4, r5, [sp, #128] + STRD r6, r7, [sp, #136] + STRD r8, r9, [sp, #144] + STRD r10, r11, [sp, #152] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + LDRD r10, r11, [r0, #56] + STRD r4, r5, [sp, #160] + STRD r6, r7, [sp, #168] + STRD r8, r9, [sp, #176] + STRD r10, r11, [sp, #184] + /* Start of loop processing a block */ +L_SHA512_transform_len_begin: + /* Load, Reverse and Store W */ + LDR r4, [r1] + LDR r5, [r1, #4] + LDR r6, [r1, #8] + LDR r7, [r1, #12] + LDR r8, [r1, #16] + LDR r9, [r1, #20] + LDR r10, [r1, #24] + LDR r11, [r1, #28] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp] + STR r4, [sp, #4] + STR r7, [sp, #8] + STR r6, [sp, #12] + STR r9, [sp, #16] + STR r8, [sp, #20] + STR r11, [sp, #24] + STR r10, [sp, #28] + LDR r4, [r1, #32] + LDR r5, [r1, #36] + LDR r6, [r1, #40] + LDR r7, [r1, #44] + LDR r8, [r1, #48] + LDR r9, [r1, #52] + LDR r10, [r1, #56] + LDR r11, [r1, #60] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #32] + STR r4, [sp, #36] + STR r7, [sp, #40] + STR r6, [sp, #44] + STR r9, [sp, #48] + STR r8, [sp, #52] + STR r11, [sp, #56] + STR r10, [sp, #60] + LDR r4, [r1, #64] + LDR r5, [r1, #68] + LDR r6, [r1, #72] + LDR r7, [r1, #76] + LDR r8, [r1, #80] + LDR r9, [r1, #84] + LDR r10, [r1, #88] + LDR r11, [r1, #92] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #64] + STR r4, [sp, #68] + STR r7, [sp, #72] + STR r6, [sp, #76] + STR r9, [sp, #80] + STR r8, [sp, #84] + STR r11, [sp, #88] + STR r10, [sp, #92] + LDR r4, [r1, #96] + LDR r5, [r1, #100] + LDR r6, [r1, #104] + LDR r7, [r1, #108] + LDR r8, [r1, #112] + LDR r9, [r1, #116] + LDR r10, [r1, #120] + LDR r11, [r1, #124] + REV r4, r4 + 
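+ /* The message block is big-endian: each 32-bit word is byte-reversed with REV and */
+ /* the two halves of every 64-bit word are swapped as they are stored to W on the stack. */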
REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #96] + STR r4, [sp, #100] + STR r7, [sp, #104] + STR r6, [sp, #108] + STR r9, [sp, #112] + STR r8, [sp, #116] + STR r11, [sp, #120] + STR r10, [sp, #124] + /* Pre-calc: b ^ c */ + LDRD r10, r11, [r0, #8] + LDRD r4, r5, [r0, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + MOV r12, #0x4 + /* Start of 16 rounds */ +L_SHA512_transform_len_start: + /* Round 0 */ + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[0] */ + LDRD r4, r5, [sp, #112] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp] + LDRD r4, r5, [sp, #8] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp] + /* Round 1 */ + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, 
r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #8] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[1] */ + LDRD r4, r5, [sp, #120] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #8] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #8] + LDRD r4, r5, [sp, #16] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #8] + /* Round 2 */ + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #16] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, 
r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[2] */ + LDRD r4, r5, [sp] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #16] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #16] + LDRD r4, r5, [sp, #24] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #16] + /* Round 3 */ + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #24] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[3] */ + LDRD r4, r5, [sp, #8] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #24] + LDRD r4, r5, [sp, #32] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #24] + /* Round 4 */ + LDRD r4, r5, [r0] + 
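+ /* Sigma1(e) = (e ROR 14) ^ (e ROR 18) ^ (e ROR 41), with e held in r4:r5; each */
+ /* 64-bit rotation is built from paired 32-bit shifts (the ROR 41 as a ROL 23). */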
LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #32] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[4] */ + LDRD r4, r5, [sp, #16] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #32] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #32] + LDRD r4, r5, [sp, #40] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #32] + /* Round 5 */ + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #40] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + 
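+ /* Sigma0(a) = (a ROR 28) ^ (a ROR 34) ^ (a ROR 39); the ROR 28 above uses right */
+ /* shifts, the remaining two are formed as left rotations (ROL 30 and ROL 25). */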
LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[5] */ + LDRD r4, r5, [sp, #24] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #40] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #40] + LDRD r4, r5, [sp, #48] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #40] + /* Round 6 */ + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #48] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[6] */ + LDRD r4, r5, [sp, #32] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #48] + LDRD r8, r9, [sp, 
#120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #48] + LDRD r4, r5, [sp, #56] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #48] + /* Round 7 */ + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #56] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[7] */ + LDRD r4, r5, [sp, #40] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #56] + LDRD r4, r5, [sp, #64] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #56] + /* Round 8 */ + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, 
r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #64] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[8] */ + LDRD r4, r5, [sp, #48] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #64] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #64] + LDRD r4, r5, [sp, #72] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #64] + /* Round 9 */ + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #72] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + 
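+ /* Maj(a, b, c) is now in r10:r11, formed as ((a ^ b) & (b ^ c)) ^ b using the */
+ /* b ^ c value cached from the previous round; it is added into the new working */
+ /* value below and r10:r11 is then reloaded with this round's a ^ b. */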
LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[9] */ + LDRD r4, r5, [sp, #56] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #72] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #72] + LDRD r4, r5, [sp, #80] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #72] + /* Round 10 */ + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #80] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[10] */ + LDRD r4, r5, [sp, #64] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #80] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #80] + LDRD r4, r5, [sp, #88] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #80] + ADDS r4, r4, r6 + ADC 
r5, r5, r7 + STRD r4, r5, [sp, #80] + /* Round 11 */ + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #88] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[11] */ + LDRD r4, r5, [sp, #72] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #88] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #88] + LDRD r4, r5, [sp, #96] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #88] + /* Round 12 */ + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #96] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + 
LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[12] */ + LDRD r4, r5, [sp, #80] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #96] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #96] + LDRD r4, r5, [sp, #104] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #96] + /* Round 13 */ + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #104] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[13] */ + LDRD r4, r5, [sp, #88] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 
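+ /* The EORs below complete sigma1(W[11]) = (W[11] ROR 19) ^ (W[11] ROR 61) ^ (W[11] >> 6); */
+ /* W[13] is then updated in place as W[13] += sigma1(W[11]) + W[6] + sigma0(W[14]), */
+ /* the rolling (mod 16) form of the SHA-512 message schedule. */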
+ EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #104] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #104] + LDRD r4, r5, [sp, #112] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #104] + /* Round 14 */ + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #112] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[14] */ + LDRD r4, r5, [sp, #96] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #112] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #112] + LDRD r4, r5, [sp, #120] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #112] + /* Round 15 */ + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, 
r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #120] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + /* Calc new W[15] */ + LDRD r4, r5, [sp, #104] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #120] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #120] + LDRD r4, r5, [sp] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #120] + ADD r3, r3, #0x80 + SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_SHA512_transform_len_start +#else + BNE.W L_SHA512_transform_len_start +#endif + /* Round 0 */ + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR 
r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + /* Round 1 */ + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #8] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + /* Round 2 */ + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #16] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, 
r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + /* Round 3 */ + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #24] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + /* Round 4 */ + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #32] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + /* Round 5 */ + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, 
#18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #40] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + /* Round 6 */ + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #48] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + /* Round 7 */ + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, 
r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #56] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + /* Round 8 */ + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #64] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + /* Round 9 */ + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, 
[r3, #72] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + /* Round 10 */ + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #80] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + /* Round 11 */ + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #88] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + 
ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + /* Round 12 */ + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #96] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + /* Round 13 */ + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #104] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 
+ EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + /* Round 14 */ + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #112] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + /* Round 15 */ + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #120] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD 
r6, r7, [r0]
+        MOV r10, r8
+        MOV r11, r9
+        /* Add in digest from start */
+        LDRD r4, r5, [r0]
+        LDRD r6, r7, [r0, #8]
+        LDRD r8, r9, [sp, #128]
+        LDRD r10, r11, [sp, #136]
+        ADDS r4, r4, r8
+        ADC r5, r5, r9
+        ADDS r6, r6, r10
+        ADC r7, r7, r11
+        STRD r4, r5, [r0]
+        STRD r6, r7, [r0, #8]
+        STRD r4, r5, [sp, #128]
+        STRD r6, r7, [sp, #136]
+        LDRD r4, r5, [r0, #16]
+        LDRD r6, r7, [r0, #24]
+        LDRD r8, r9, [sp, #144]
+        LDRD r10, r11, [sp, #152]
+        ADDS r4, r4, r8
+        ADC r5, r5, r9
+        ADDS r6, r6, r10
+        ADC r7, r7, r11
+        STRD r4, r5, [r0, #16]
+        STRD r6, r7, [r0, #24]
+        STRD r4, r5, [sp, #144]
+        STRD r6, r7, [sp, #152]
+        LDRD r4, r5, [r0, #32]
+        LDRD r6, r7, [r0, #40]
+        LDRD r8, r9, [sp, #160]
+        LDRD r10, r11, [sp, #168]
+        ADDS r4, r4, r8
+        ADC r5, r5, r9
+        ADDS r6, r6, r10
+        ADC r7, r7, r11
+        STRD r4, r5, [r0, #32]
+        STRD r6, r7, [r0, #40]
+        STRD r4, r5, [sp, #160]
+        STRD r6, r7, [sp, #168]
+        LDRD r4, r5, [r0, #48]
+        LDRD r6, r7, [r0, #56]
+        LDRD r8, r9, [sp, #176]
+        LDRD r10, r11, [sp, #184]
+        ADDS r4, r4, r8
+        ADC r5, r5, r9
+        ADDS r6, r6, r10
+        ADC r7, r7, r11
+        STRD r4, r5, [r0, #48]
+        STRD r6, r7, [r0, #56]
+        STRD r4, r5, [sp, #176]
+        STRD r6, r7, [sp, #184]
+        SUBS r2, r2, #0x80
+        SUB r3, r3, #0x200
+        ADD r1, r1, #0x80
+#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__)
+        BNE L_SHA512_transform_len_begin
+#else
+        BNE.W L_SHA512_transform_len_begin
+#endif
+        EOR r0, r0, r0
+        ADD sp, sp, #0xc0
+        POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+        /* Cycle Count = 5021 */
+        .size Transform_Sha512_Len,.-Transform_Sha512_Len
+#endif /* WOLFSSL_ARMASM_NO_NEON */
+#endif /* WOLFSSL_SHA512 */
+#endif /* WOLFSSL_ARMASM_THUMB2 */
+#endif /* WOLFSSL_ARMASM */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+#endif /* !WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
new file mode 100644
index 000000000..6a223b19a
--- /dev/null
+++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
@@ -0,0 +1,3595 @@
+/* thumb2-sha512-asm
+ *
+ * Copyright (C) 2006-2024 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+/* Generated using (from wolfssl):
+ * cd ../scripts
+ * ruby ./sha2/sha512.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha512-asm.c
+ */
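+
+/* Annotation (added for readability; not part of the generator's output):
+ * the function below is the Thumb-2 inline-assembly form of the SHA-512
+ * block transform added as Transform_Sha512_Len in the .S file above.
+ * Each 128-byte block is byte-swapped with REV into a sixteen-entry 64-bit
+ * message schedule on the stack, the rounds are driven by the
+ * L_SHA512_transform_len_k constants, 64-bit values are handled as 32-bit
+ * register pairs (a 64-bit rotate becomes an LSRS/LSLS/ORR sequence over
+ * both halves), and the digest copied out at the start of the loop is added
+ * back in once the rounds for the block are done.
+ */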
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif /* HAVE_CONFIG_H */
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+
+#ifdef WOLFSSL_ARMASM
+#ifdef WOLFSSL_ARMASM_THUMB2
+#ifdef WOLFSSL_ARMASM_INLINE
+
+#ifdef __IAR_SYSTEMS_ICC__
+#define __asm__        asm
+#define __volatile__   volatile
+#define WOLFSSL_NO_VAR_ASSIGN_REG
+#endif /* __IAR_SYSTEMS_ICC__ */
+#ifdef __KEIL__
+#define __asm__        __asm
+#define __volatile__   volatile
+#endif /* __KEIL__ */
+#ifdef WOLFSSL_SHA512
+#include <wolfssl/wolfcrypt/sha512.h>
+
+#ifdef WOLFSSL_ARMASM_NO_NEON
+static const word64 L_SHA512_transform_len_k[] = {
+    0x428a2f98d728ae22UL, 0x7137449123ef65cdUL,
+    0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
+    0x3956c25bf348b538UL, 0x59f111f1b605d019UL,
+    0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
+    0xd807aa98a3030242UL, 0x12835b0145706fbeUL,
+    0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
+    0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL,
+    0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
+    0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL,
+    0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
+    0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL,
+    0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
+    0x983e5152ee66dfabUL, 0xa831c66d2db43210UL,
+    0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
+    0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL,
+    0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
+    0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL,
+    0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
+    0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL,
+    0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
+    0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL,
+    0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
+    0xd192e819d6ef5218UL, 0xd69906245565a910UL,
+    0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
+    0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL,
+    0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
+    0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL,
+    0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
+    0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL,
+    0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
+    0x90befffa23631e28UL, 0xa4506cebde82bde9UL,
+    0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
+    0xca273eceea26619cUL, 0xd186b8c721c0c207UL,
+    0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
+    0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL,
+    0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
+    0x28db77f523047d84UL, 0x32caab7b40c72493UL,
+    0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
+    0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL,
+    0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
+};
+
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len);
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p)
+#else
+void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len)
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
+{
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+    register wc_Sha512* sha512 __asm__ ("r0") = (wc_Sha512*)sha512_p;
+    register const byte* data __asm__ ("r1") = (const byte*)data_p;
+    register word32 len __asm__ ("r2") = (word32)len_p;
+    register word64* L_SHA512_transform_len_k_c __asm__ ("r3") = (word64*)&L_SHA512_transform_len_k;
+#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
+
+    __asm__ __volatile__ (
+        "SUB sp, sp, #0xc0\n\t"
+        "MOV r3, %[L_SHA512_transform_len_k]\n\t"
+        /* Copy digest to add
in at end */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r10, r11, [%[sha512], #24]\n\t" + "STRD r4, r5, [sp, #128]\n\t" + "STRD r6, r7, [sp, #136]\n\t" + "STRD r8, r9, [sp, #144]\n\t" + "STRD r10, r11, [sp, #152]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r10, r11, [%[sha512], #56]\n\t" + "STRD r4, r5, [sp, #160]\n\t" + "STRD r6, r7, [sp, #168]\n\t" + "STRD r8, r9, [sp, #176]\n\t" + "STRD r10, r11, [sp, #184]\n\t" + /* Start of loop processing a block */ + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_SHA512_transform_len_begin:\n\t" +#else + "L_SHA512_transform_len_begin_%=:\n\t" +#endif + /* Load, Reverse and Store W */ + "LDR r4, [%[data]]\n\t" + "LDR r5, [%[data], #4]\n\t" + "LDR r6, [%[data], #8]\n\t" + "LDR r7, [%[data], #12]\n\t" + "LDR r8, [%[data], #16]\n\t" + "LDR r9, [%[data], #20]\n\t" + "LDR r10, [%[data], #24]\n\t" + "LDR r11, [%[data], #28]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp]\n\t" + "STR r4, [sp, #4]\n\t" + "STR r7, [sp, #8]\n\t" + "STR r6, [sp, #12]\n\t" + "STR r9, [sp, #16]\n\t" + "STR r8, [sp, #20]\n\t" + "STR r11, [sp, #24]\n\t" + "STR r10, [sp, #28]\n\t" + "LDR r4, [%[data], #32]\n\t" + "LDR r5, [%[data], #36]\n\t" + "LDR r6, [%[data], #40]\n\t" + "LDR r7, [%[data], #44]\n\t" + "LDR r8, [%[data], #48]\n\t" + "LDR r9, [%[data], #52]\n\t" + "LDR r10, [%[data], #56]\n\t" + "LDR r11, [%[data], #60]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #32]\n\t" + "STR r4, [sp, #36]\n\t" + "STR r7, [sp, #40]\n\t" + "STR r6, [sp, #44]\n\t" + "STR r9, [sp, #48]\n\t" + "STR r8, [sp, #52]\n\t" + "STR r11, [sp, #56]\n\t" + "STR r10, [sp, #60]\n\t" + "LDR r4, [%[data], #64]\n\t" + "LDR r5, [%[data], #68]\n\t" + "LDR r6, [%[data], #72]\n\t" + "LDR r7, [%[data], #76]\n\t" + "LDR r8, [%[data], #80]\n\t" + "LDR r9, [%[data], #84]\n\t" + "LDR r10, [%[data], #88]\n\t" + "LDR r11, [%[data], #92]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #64]\n\t" + "STR r4, [sp, #68]\n\t" + "STR r7, [sp, #72]\n\t" + "STR r6, [sp, #76]\n\t" + "STR r9, [sp, #80]\n\t" + "STR r8, [sp, #84]\n\t" + "STR r11, [sp, #88]\n\t" + "STR r10, [sp, #92]\n\t" + "LDR r4, [%[data], #96]\n\t" + "LDR r5, [%[data], #100]\n\t" + "LDR r6, [%[data], #104]\n\t" + "LDR r7, [%[data], #108]\n\t" + "LDR r8, [%[data], #112]\n\t" + "LDR r9, [%[data], #116]\n\t" + "LDR r10, [%[data], #120]\n\t" + "LDR r11, [%[data], #124]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #96]\n\t" + "STR r4, [sp, #100]\n\t" + "STR r7, [sp, #104]\n\t" + "STR r6, [sp, #108]\n\t" + "STR r9, [sp, #112]\n\t" + "STR r8, [sp, #116]\n\t" + "STR r11, [sp, #120]\n\t" + "STR r10, [sp, #124]\n\t" + /* Pre-calc: b ^ c */ + "LDRD r10, r11, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "MOV r12, #0x4\n\t" + /* Start of 16 rounds */ + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + 
"L_SHA512_transform_len_start:\n\t" +#else + "L_SHA512_transform_len_start_%=:\n\t" +#endif + /* Round 0 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[0] */ + "LDRD r4, r5, [sp, #112]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp]\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp]\n\t" + /* Round 1 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, 
r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #8]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[1] */ + "LDRD r4, r5, [sp, #120]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #8]\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #8]\n\t" + /* Round 2 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL 
#14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #16]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[2] */ + "LDRD r4, r5, [sp]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #16]\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #16]\n\t" + /* Round 3 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, 
r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #24]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[3] */ + "LDRD r4, r5, [sp, #8]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #24]\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #24]\n\t" + /* Round 4 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" 
+ "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #32]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[4] */ + "LDRD r4, r5, [sp, #16]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #32]\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #32]\n\t" + /* Round 5 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, 
r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #40]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[5] */ + "LDRD r4, r5, [sp, #24]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #40]\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #40]\n\t" + /* Round 6 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR 
r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #48]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[6] */ + "LDRD r4, r5, [sp, #32]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #48]\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #48]\n\t" + /* Round 7 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, 
#56]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[7] */ + "LDRD r4, r5, [sp, #40]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #56]\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #56]\n\t" + /* Round 8 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #64]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], 
#56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[8] */ + "LDRD r4, r5, [sp, #48]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #64]\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #64]\n\t" + /* Round 9 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #72]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, 
#28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[9] */ + "LDRD r4, r5, [sp, #56]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #72]\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #72]\n\t" + /* Round 10 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #80]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + 
"ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[10] */ + "LDRD r4, r5, [sp, #64]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #80]\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #80]\n\t" + /* Round 11 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #88]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, 
#25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[11] */ + "LDRD r4, r5, [sp, #72]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #88]\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #88]\n\t" + /* Round 12 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #96]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, 
r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[12] */ + "LDRD r4, r5, [sp, #80]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #96]\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #96]\n\t" + /* Round 13 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #104]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, 
r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[13] */ + "LDRD r4, r5, [sp, #88]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #104]\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #104]\n\t" + /* Round 14 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #112]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, 
r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[14] */ + "LDRD r4, r5, [sp, #96]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #112]\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #112]\n\t" + /* Round 15 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #120]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, 
r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[15] */ + "LDRD r4, r5, [sp, #104]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #120]\n\t" + "LDRD r4, r5, [sp]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #120]\n\t" + "ADD r3, r3, #0x80\n\t" + "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) + "BNE L_SHA512_transform_len_start_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_SHA512_transform_len_start\n\t" +#else + "BNE.W L_SHA512_transform_len_start_%=\n\t" +#endif + /* Round 0 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, 
r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 1 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #8]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 2 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #16]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], 
#40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 3 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #24]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 4 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, 
r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #32]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 5 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #40]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, 
#25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 6 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #48]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 7 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + 
"LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #56]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 8 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #64]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, 
r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 9 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #72]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 10 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #80]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + 
"ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 11 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #88]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 12 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, 
#18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #96]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 13 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #104]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR 
r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 14 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #112]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 15 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, 
[%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #120]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Add in digest from start */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #128]\n\t" + "LDRD r10, r11, [sp, #136]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [sp, #128]\n\t" + "STRD r6, r7, [sp, #136]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #144]\n\t" + "LDRD r10, r11, [sp, #152]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [sp, #144]\n\t" + "STRD r6, r7, [sp, #152]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #160]\n\t" + "LDRD r10, r11, [sp, #168]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [sp, #160]\n\t" + "STRD r6, r7, [sp, #168]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #176]\n\t" + "LDRD r10, r11, [sp, #184]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [sp, #176]\n\t" + "STRD r6, r7, [sp, #184]\n\t" + "SUBS %[len], %[len], #0x80\n\t" + "SUB r3, r3, #0x200\n\t" + "ADD %[data], %[data], #0x80\n\t" +#if defined(__GNUC__) + "BNE L_SHA512_transform_len_begin_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.W L_SHA512_transform_len_begin\n\t" +#else + "BNE.W L_SHA512_transform_len_begin_%=\n\t" +#endif + "EOR r0, r0, r0\n\t" + "ADD sp, sp, #0xc0\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha512] "+r" 
(sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" +#else + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA512 */ +#endif /* WOLFSSL_ARMASM_THUMB2 */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha256.c b/wolfcrypt/src/port/riscv/riscv-64-sha256.c new file mode 100644 index 000000000..099e56744 --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-sha256.c @@ -0,0 +1,1417 @@ +/* riscv-sha256.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#ifdef WOLFSSL_RISCV_ASM +#if !defined(NO_SHA256) || defined(WOLFSSL_SHA224) + +#if FIPS_VERSION3_LT(6,0,0) && defined(HAVE_FIPS) + #undef HAVE_FIPS +#else + #if defined(HAVE_FIPS) && FIPS_VERSION3_GE(6,0,0) + /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ + #define FIPS_NO_WRAPPERS + #endif +#endif + +#include +#if FIPS_VERSION3_GE(6,0,0) + const unsigned int wolfCrypt_FIPS_sha256_ro_sanity[2] = + { 0x1a2b3c4d, 0x00000014 }; + int wolfCrypt_FIPS_SHA256_sanity(void) + { + return 0; + } +#endif +#include +#include + +#include + +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +/* Constants to add in each round. */ +static const FLASH_QUALIFIER ALIGN32 word32 K[64] = { + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, + 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, + 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, + 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, + 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, + 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, + 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, + 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, + 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, + 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L +}; + +/* Initialize SHA-256 object for hashing. + * + * @param [in, out] sha256 SHA-256 object. + */ +static void InitSha256(wc_Sha256* sha256) +{ + /* Set initial hash values. 
*/ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha256->digest[0] = 0x6A09E667L; + sha256->digest[1] = 0xBB67AE85L; + sha256->digest[2] = 0x3C6EF372L; + sha256->digest[3] = 0xA54FF53AL; + sha256->digest[4] = 0x510E527FL; + sha256->digest[5] = 0x9B05688CL; + sha256->digest[6] = 0x1F83D9ABL; + sha256->digest[7] = 0x5BE0CD19L; +#else + /* f, e, b, a, h, g, d, c */ + sha256->digest[0] = 0x9B05688CL; + sha256->digest[1] = 0x510E527FL; + sha256->digest[2] = 0xBB67AE85L; + sha256->digest[3] = 0x6A09E667L; + sha256->digest[4] = 0x5BE0CD19L; + sha256->digest[5] = 0x1F83D9ABL; + sha256->digest[6] = 0xA54FF53AL; + sha256->digest[7] = 0x3C6EF372L; +#endif + + /* No hashed data. */ + sha256->buffLen = 0; + /* No data hashed. */ + sha256->loLen = 0; + sha256->hiLen = 0; + +#ifdef WOLFSSL_HASH_FLAGS + sha256->flags = 0; +#endif +} + +/* More data hashed, add length to 64-bit cumulative total. + * + * @param [in, out] sha256 SHA-256 object. Assumed not NULL. + * @param [in] len Length to add. + */ +static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) +{ + word32 tmp = sha256->loLen; + if ((sha256->loLen += len) < tmp) + sha256->hiLen++; /* carry low to high */ +} + +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + +/* Load a word with bytes reversed. */ +#define LOAD_WORD_REV(r, o, p, t0, t1, t2) \ + "lbu " #t0 ", " #o "(" #p ")\n\t" \ + "lbu " #t1 ", " #o "+1(" #p ")\n\t" \ + "lbu " #t2 ", " #o "+2(" #p ")\n\t" \ + "lbu " #r ", " #o "+3(" #p ")\n\t" \ + "slli " #t0 ", " #t0 ", 24\n\t" \ + "slli " #t1 ", " #t1 ", 16\n\t" \ + "slli " #t2 ", " #t2 ", 8\n\t" \ + "or " #r ", " #r ", " #t0 "\n\t" \ + "or " #r ", " #r ", " #t1 "\n\t" \ + "or " #r ", " #r ", " #t2 "\n\t" + +/* Load a word with bytes reversed. */ +#define LOAD_DWORD_REV(r, o, p, t0, t1, t2, t3) \ + "lbu " #t0 ", " #o "(" #p ")\n\t" \ + "lbu " #t1 ", " #o "+1(" #p ")\n\t" \ + "lbu " #t2 ", " #o "+2(" #p ")\n\t" \ + "lbu " #r ", " #o "+3(" #p ")\n\t" \ + "slli " #t0 ", " #t0 ", 24\n\t" \ + "slli " #t1 ", " #t1 ", 16\n\t" \ + "slli " #t2 ", " #t2 ", 8\n\t" \ + "or " #r ", " #r ", " #t0 "\n\t" \ + "or " #r ", " #r ", " #t1 "\n\t" \ + "or " #r ", " #r ", " #t2 "\n\t" \ + "lbu " #t0 ", " #o "+4(" #p ")\n\t" \ + "lbu " #t1 ", " #o "+5(" #p ")\n\t" \ + "lbu " #t2 ", " #o "+6(" #p ")\n\t" \ + "lbu " #t3 ", " #o "+7(" #p ")\n\t" \ + "slli " #t0 ", " #t0 ", 56\n\t" \ + "slli " #t1 ", " #t1 ", 48\n\t" \ + "slli " #t2 ", " #t2 ", 40\n\t" \ + "slli " #t3 ", " #t3 ", 32\n\t" \ + "or " #r ", " #r ", " #t0 "\n\t" \ + "or " #r ", " #r ", " #t1 "\n\t" \ + "or " #r ", " #r ", " #t2 "\n\t" \ + "or " #r ", " #r ", " #t3 "\n\t" + +#define PACK_BB(rd, rs1, rs2, rrd, rrs1, rrs2) \ + "slli " #rd ", " #rs1 ", 32\n\t" \ + "slli " #rs2 ", " #rs2 ", 32\n\t" \ + "srli " #rd ", " #rs1 ", 32\n\t" \ + "or " #rd ", " #rd ", " #rs2 "\n\t" + +#else + +#define PACK_BB(rd, rs1, rs2, rrd, rrs1, rrs2) \ + PACK(rrd, rrs1, rrs2) + +#endif + +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + +#ifdef WOLFSSL_RISCV_SCALAR_CRYPTO_ASM + +/* SHA-256 SUM0 operation. */ +#define SHA256SUM0(rd, rs1) \ + ASM_WORD((0b000100000000 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-256 SUM1 operation. */ +#define SHA256SUM1(rd, rs1) \ + ASM_WORD((0b000100000001 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-256 SIGMA0 operation. */ +#define SHA256SIG0(rd, rs1) \ + ASM_WORD((0b000100000010 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-256 SIGMA1 operation. 
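+ * The ASM_WORD encodings here emit the raw Zknh instruction words
+ * (sha256sum0/sum1/sig0/sig1), so no assembler support for the extension is
+ * needed. This one, sha256sig1, computes the FIPS 180-4 small sigma1:
+ * ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10).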
*/ +#define SHA256SIG1(rd, rs1) \ + ASM_WORD((0b000100000011 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) + +/* One round of compression. */ +#define RND(a, b, c, d, e, f, g, h, w, k) \ + /* Get e and a */ \ + "mv a4, " #e "\n\t" \ + "mv a5, " #a "\n\t" \ + /* Sigma1(e) */ \ + SHA256SUM1(REG_A4, REG_A4) \ + /* Sigma0(a) */ \ + SHA256SUM0(REG_A5, REG_A5) \ + /* Maj(a, b, c) = t5 */ \ + /* Ch(e, f, g) = t6 */ \ + /* f ^ g */ \ + "xor t6, " #f ", " #g "\n\t" \ + /* a ^ b */ \ + "xor t4, " #a ", " #b "\n\t" \ + /* b ^ c */ \ + "xor t5, " #b ", " #c "\n\t" \ + /* (f ^ g) & e */ \ + "and t6, t6, " #e "\n\t" \ + /* h + sigma1 */ \ + "addw " #h ", " #h ", a4\n\t" \ + /* (a^b) & (b^c) */ \ + "and t5, t5, t4\n\t" \ + /* ((f ^ g) & e) ^ g */ \ + "xor t6, t6, " #g "\n\t" \ + /* K + W */ \ + "addw t4, " #k ", " #w "\n\t" \ + /* ((a^b) & (b^c)) ^ b */ \ + "xor t5, t5, " #b "\n\t" \ + /* h + sigma1 + Ch */ \ + "addw " #h ", " #h ", t6\n\t" \ + /* 't0' = h + sigma1 + Ch + K + W */ \ + "addw " #h ", " #h ", t4\n\t" \ + /* Sigma0(a) + Maj = 't1' */ \ + "addw t5, a5, t5\n\t" \ + /* d += 't0' */ \ + "addw " #d ", " #d ", " #h "\n\t" \ + /* 't0' += 't1' */ \ + "addw " #h ", " #h ", t5\n\t" + +/* Two message schedule updates. */ +#define W_UPDATE_2(w0, w1, w4, w5, w7, reg_w0, reg_w1, reg_w7) \ + /* W[i-15] = W[1] */ \ + "srli t4, " #w0 ", 32\n\t" \ + /* W[i-7] = W[9] */ \ + "srli t6, " #w4 ", 32\n\t" \ + /* Gamma0(W[1]) */ \ + SHA256SIG0(REG_A4, REG_T4) \ + /* Gamma1(W[i-2]) = Gamma1(W[14]) */ \ + SHA256SIG1(REG_A5, reg_w7) \ + /* Gamma1(W[14]) + W[9] */ \ + "addw a5, a5, t6\n\t" \ + /* Gamma0(W[1]) + W[i-16] = Gamma0(W[1]) + W[0] */ \ + "addw " #w0 ", " #w0 ", a4\n\t" \ + /* W[i+1-2] = W[15] */ \ + "srli t5, " #w7 ", 32\n\t" \ + /* W[0] = Gamma1(W[14]) + W[9] + Gamma0(W[1]) + W[0] */ \ + "addw " #w0 ", a5, " #w0 "\n\t" \ + \ + /* W[i+1-16] = W[1] = t4 */ \ + /* Gamma0(W[i+1-15]) = Gamma0(W[2]) */ \ + SHA256SIG0(REG_A6, reg_w1) \ + /* Gamma1(W[i+1-2]) = Gamma1(W[15]) */ \ + SHA256SIG1(REG_A7, REG_T5) \ + /* Gamma1(W[15]) + W[i+1-7] = Gamma1(W[15]) + W[10] */ \ + "addw a7, a7, " #w5 "\n\t" \ + /* Gamma0(W[2]) + W[i+1-16] = Gamma0(W[2]) + W[1] */ \ + "addw t5, a6, t4\n\t" \ + /* Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */ \ + "addw a7, a7, t5\n\t" \ + /* Place in W[i+1-16] = W[1] */ \ + PACK_BB(w0, w0, a7, reg_w0, reg_w0, REG_A7) + +#else + +/* SHA-256 SIGMA1 operation. */ +#define SHA256SIG1(rd, rs1) \ + "slliw t6, " #rs1 ", 15\n\t" \ + "srliw t5, " #rs1 ", 17\n\t" \ + "slliw t4, " #rs1 ", 13\n\t" \ + "srliw " #rd ", " #rs1 ", 19\n\t" \ + "or t6, t6, t5\n\t" \ + "srliw t5, " #rs1 ", 10\n\t" \ + "xor " #rd ", "#rd ", t4\n\t" \ + "xor t6, t6, t5\n\t" \ + "xor " #rd ", " #rd ", t6\n\t" \ + +/* One round of compression. 
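+ * In plain C terms (a sketch; Sigma0, Sigma1, Ch and Maj as defined in
+ * FIPS 180-4) each invocation computes:
+ *     t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];
+ *     t1 = Sigma0(a) + Maj(a, b, c);
+ *     d += t0;
+ *     h  = t0 + t1;
+ * with the callers rotating the register roles a..h between rounds instead
+ * of moving values.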
*/ +#define RND(a, b, c, d, e, f, g, h, w, k) \ + /* a4 = Sigma1(e) */ \ + "slliw t5, " #e ", 26\n\t" \ + "srliw t4, " #e ", 6\n\t" \ + "slliw t6, " #e ", 21\n\t" \ + "srliw a4, " #e ", 11\n\t" \ + "slliw a5, " #e ", 7\n\t" \ + "or t4, t4, t5\n\t" \ + "xor a4, a4, t6\n\t" \ + "srliw t5, " #e ", 25\n\t" \ + "xor t4, t4, a5\n\t" \ + "xor a4, a4, t5\n\t" \ + /* a5 = Sigma0(a) */ \ + "slliw t5, " #a ", 30\n\t" \ + "xor a4, a4, t4\n\t" \ + "srliw t4, " #a ", 2\n\t" \ + "slliw t6, " #a ", 19\n\t" \ + /* h + sigma1 */ \ + "addw " #h ", " #h ", a4\n\t" \ + "srliw a5, " #a ", 13\n\t" \ + "slliw a4, " #a ", 10\n\t" \ + "or t4, t4, t5\n\t" \ + "xor a5, a5, t6\n\t" \ + "srliw t6, " #a ", 22\n\t" \ + "xor t4, t4, a4\n\t" \ + "xor a5, a5, t6\n\t" \ + /* Maj(a, b, c) = t5 */ \ + /* Ch(e, f, g) = t6 */ \ + /* f ^ g */ \ + "xor t6, " #f ", " #g "\n\t" \ + /* a ^ b */ \ + "xor t5, " #a ", " #b "\n\t" \ + /* b ^ c */ \ + "xor a4, " #b ", " #c "\n\t" \ + "xor a5, a5, t4\n\t" \ + /* (f ^ g) & e */ \ + "and t6, t6, " #e "\n\t" \ + /* (a^b) & (b^c) */ \ + "and t5, t5, a4\n\t" \ + /* ((f ^ g) & e) ^ g */ \ + "xor t6, t6, " #g "\n\t" \ + /* K + W */ \ + "addw a4, " #k ", " #w "\n\t" \ + /* h + sigma1 + Ch */ \ + "addw " #h ", " #h ", t6\n\t" \ + /* ((a^b) & (b^c)) ^ b */ \ + "xor t5, t5, " #b "\n\t" \ + /* 't0' = h + sigma1 + Ch + K + W */ \ + "addw " #h ", " #h ", a4\n\t" \ + /* 't1' = Sigma0(a) + Maj */ \ + "addw t5, a5, t5\n\t" \ + /* d += 't0' */ \ + "addw " #d ", " #d ", " #h "\n\t" \ + /* h = 't0' + 't1' */ \ + "addw " #h ", " #h ", t5\n\t" + +/* Two message schedule updates. */ +#define W_UPDATE_2(w0, w1, w4, w5, w7, reg_w0, reg_w1, reg_w7) \ + /* W[i-15] = W[1] */ \ + "srli a7, " #w0 ", 32\n\t" \ + /* W[i-7] = W[9] */ \ + "srli a6, " #w4 ", 32\n\t" \ + /* Gamma0(W[1]) */ \ + "slliw t4, a7, 25\n\t" \ + "srliw t5, a7, 7\n\t" \ + "slliw t6, a7, 14\n\t" \ + "srliw a4, a7, 18\n\t" \ + "or t4, t4, t5\n\t" \ + "srliw t5, a7, 3\n\t" \ + "xor a4, a4, t6\n\t" \ + "xor t4, t4, t5\n\t" \ + /* Gamma1(W[i-2]) = Gamma1(W[14]) */ \ + "slliw t6, " #w7 ", 15\n\t" \ + "srliw t5, " #w7 ", 17\n\t" \ + "xor a4, a4, t4\n\t" \ + "slliw t4, " #w7 ", 13\n\t" \ + "srliw a5, " #w7 ", 19\n\t" \ + "or t6, t6, t5\n\t" \ + "srliw t5, " #w7 ", 10\n\t" \ + "xor a5, a5, t4\n\t" \ + "xor t6, t6, t5\n\t" \ + "xor a5, a5, t6\n\t" \ + /* Gamma0(W[1]) + W[i-16] = Gamma0(W[1]) + W[0] */ \ + "addw " #w0 ", " #w0 ", a4\n\t" \ + /* Gamma1(W[14]) + W[9] */ \ + "addw a5, a5, a6\n\t" \ + /* W[0] = Gamma1(W[14]) + W[9] + Gamma0(W[1]) + W[0] */ \ + "addw " #w0 ", a5, " #w0 "\n\t" \ + \ + /* W[i+1-16] = W[1] = a7 */ \ + /* W[i+1-2] = W[15] */ \ + "srli a4, " #w7 ", 32\n\t" \ + /* Gamma0(W[i+1-15]) = Gamma0(W[2]) */ \ + "slliw t4, " #w1 ", 25\n\t" \ + "srliw t5, " #w1 ", 7\n\t" \ + "slliw t6, " #w1 ", 14\n\t" \ + "srliw a6, " #w1 ", 18\n\t" \ + "or t4, t4, t5\n\t" \ + "srliw t5, " #w1 ", 3\n\t" \ + "xor a6, a6, t6\n\t" \ + "xor t4, t4, t5\n\t" \ + /* Gamma1(W[i+1-2]) = Gamma1(W[15]) */ \ + "slliw t6, a4, 15\n\t" \ + "srliw t5, a4, 17\n\t" \ + "xor a6, a6, t4\n\t" \ + "slliw t4, a4, 13\n\t" \ + "srliw a5, a4, 19\n\t" \ + "or t6, t6, t5\n\t" \ + "srliw t5, a4, 10\n\t" \ + "xor a5, a5, t4\n\t" \ + "xor t6, t6, t5\n\t" \ + "xor a5, a5, t6\n\t" \ + /* Gamma0(W[2]) + W[i+1-16] = Gamma0(W[2]) + W[1] */ \ + "addw t5, a6, a7\n\t" \ + /* Gamma1(W[15]) + W[i+1-7] = Gamma1(W[15]) + W[10] */ \ + "addw a5, a5, " #w5 "\n\t" \ + /* Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */ \ + "addw a5, a5, t5\n\t" \ + /* Place in W[i+1-16] = W[1] */ \ + PACK_BB(w0, w0, a5, 
reg_w0, reg_w0, REG_A5) + +#endif /* WOLFSSL_RISCV_SCALAR_CRYPTO_ASM */ + +/* Two rounds of compression. */ +#define RND2(a, b, c, d, e, f, g, h, w, o) \ + /* Get k[i], k[i+1] */ \ + "ld a6, " #o "(%[k])\n\t" \ + RND(a, b, c, d, e, f, g, h, w, a6) \ + /* Move k[i+1] down */ \ + "srli a6, a6, 32\n\t" \ + /* Move W[i] down */ \ + "srli a7, " #w ", 32\n\t" \ + RND(h, a, b, c, d, e, f, g, a7, a6) + +/* Sixteen rounds of compression with message scheduling. */ +#define RND16() \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, s0, 0) \ + W_UPDATE_2(s0, s1, s4, s5, s7, REG_S0, REG_S1, REG_S7) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, s1, 8) \ + W_UPDATE_2(s1, s2, s5, s6, s0, REG_S1, REG_S2, REG_S0) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, s2, 16) \ + W_UPDATE_2(s2, s3, s6, s7, s1, REG_S2, REG_S3, REG_S1) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, s3, 24) \ + W_UPDATE_2(s3, s4, s7, s0, s2, REG_S3, REG_S4, REG_S2) \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, s4, 32) \ + W_UPDATE_2(s4, s5, s0, s1, s3, REG_S4, REG_S5, REG_S3) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, s5, 40) \ + W_UPDATE_2(s5, s6, s1, s2, s4, REG_S5, REG_S6, REG_S4) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, s6, 48) \ + W_UPDATE_2(s6, s7, s2, s3, s5, REG_S6, REG_S7, REG_S5) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, s7, 56) \ + W_UPDATE_2(s7, s0, s3, s4, s6, REG_S7, REG_S0, REG_S6) + +/* Sixteen rounds of compression only. */ +#define RND16_LAST() \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, s0, 0) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, s1, 8) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, s2, 16) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, s3, 24) \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, s4, 32) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, s5, 40) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, s6, 48) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, s7, 56) + +/* Transform the message data. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @param [in] blocks Number of blocks of data to hash. + */ +static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data, + word32 blocks) +{ + word32* k = (word32*)K; + + __asm__ __volatile__ ( + /* Load digest. */ + "ld t0, 0(%[digest])\n\t" + "ld t2, 8(%[digest])\n\t" + "ld s8, 16(%[digest])\n\t" + "ld s10, 24(%[digest])\n\t" + "srli t1, t0, 32\n\t" + "srli t3, t2, 32\n\t" + "srli s9, s8, 32\n\t" + "srli s11, s10, 32\n\t" + + /* 4 rounds of 16 per block. 
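+         * blocks is scaled by 4 below so a single counter serves two
+         * purposes: its low two bits count the three scheduled 16-round
+         * groups within a block (the fourth group, RND16_LAST, needs no
+         * message schedule update), and the remaining bits count whole
+         * 64-byte blocks for the outer loop.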
*/ + "slli %[blocks], %[blocks], 2\n\t" + + "\n1:\n\t" + /* beginning of SHA256 block operation */ + /* Load W */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + LOAD_DWORD_REV(s0, 0, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s1, 8, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s2, 16, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s3, 24, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s4, 32, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s5, 40, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7) +#else + "lwu a4, 0(%[data])\n\t" + "lwu s0, 4(%[data])\n\t" + "lwu a5, 8(%[data])\n\t" + "lwu s1, 12(%[data])\n\t" + "lwu a6, 16(%[data])\n\t" + "lwu s2, 20(%[data])\n\t" + "lwu a7, 24(%[data])\n\t" + "lwu s3, 28(%[data])\n\t" + PACK_BB(s0, s0, a4, REG_S0, REG_S0, REG_A4) + PACK_BB(s1, s1, a5, REG_S1, REG_S1, REG_A5) + PACK_BB(s2, s2, a6, REG_S2, REG_S2, REG_A6) + PACK_BB(s3, s3, a7, REG_S3, REG_S3, REG_A7) + REV8(REG_S0, REG_S0) + REV8(REG_S1, REG_S1) + REV8(REG_S2, REG_S2) + REV8(REG_S3, REG_S3) + "lwu a4, 32(%[data])\n\t" + "lwu s4, 36(%[data])\n\t" + "lwu a5, 40(%[data])\n\t" + "lwu s5, 44(%[data])\n\t" + "lwu a6, 48(%[data])\n\t" + "lwu s6, 52(%[data])\n\t" + "lwu a7, 56(%[data])\n\t" + "lwu s7, 60(%[data])\n\t" + PACK_BB(s4, s4, a4, REG_S4, REG_S4, REG_A4) + PACK_BB(s5, s5, a5, REG_S5, REG_S5, REG_A5) + PACK_BB(s6, s6, a6, REG_S6, REG_S6, REG_A6) + PACK_BB(s7, s7, a7, REG_S7, REG_S7, REG_A7) + REV8(REG_S4, REG_S4) + REV8(REG_S5, REG_S5) + REV8(REG_S6, REG_S6) + REV8(REG_S7, REG_S7) +#endif + + /* Subtract one as there are only 3 loops. */ + "addi %[blocks], %[blocks], -1\n\t" + "\n2:\n\t" + RND16() + "addi %[blocks], %[blocks], -1\n\t" + "add %[k], %[k], 64\n\t" + "andi a4, %[blocks], 3\n\t" + "bnez a4, 2b \n\t" + RND16_LAST() + "addi %[k], %[k], -192\n\t" + + "# Add working vars back into digest state.\n\t" + "ld a4, 0(%[digest])\n\t" + "ld a5, 8(%[digest])\n\t" + "ld a6, 16(%[digest])\n\t" + "ld a7, 24(%[digest])\n\t" + "addw t0, t0, a4\n\t" + "addw t2, t2, a5\n\t" + "addw s8, s8, a6\n\t" + "addw s10, s10, a7\n\t" + "srli a4, a4, 32\n\t" + "srli a5, a5, 32\n\t" + "srli a6, a6, 32\n\t" + "srli a7, a7, 32\n\t" + "addw t1, t1, a4\n\t" + "addw t3, t3, a5\n\t" + "addw s9, s9, a6\n\t" + "addw s11, s11, a7\n\t" + + /* Store digest. */ + "sw t0, 0(%[digest])\n\t" + "sw t1, 4(%[digest])\n\t" + "sw t2, 8(%[digest])\n\t" + "sw t3, 12(%[digest])\n\t" + "sw s8, 16(%[digest])\n\t" + "sw s9, 20(%[digest])\n\t" + "sw s10, 24(%[digest])\n\t" + "sw s11, 28(%[digest])\n\t" + + "add %[data], %[data], 64\n\t" + "bnez %[blocks], 1b \n\t" + + : [blocks] "+r" (blocks), [data] "+r" (data), [k] "+r" (k) + : [digest] "r" (sha256->digest) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", + "s11" + ); +} + +#else + +/* Two rounds of compression using low two 32-bit W values. + * Assumes K has been added into W values. + */ +#define VSHA2CL_VV(vd, vs1, vs2) \ + ASM_WORD((0b101111 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +/* Two rounds of compression using upper two 32-bit W values. + * Assumes K has been added into W values. + */ +#define VSHA2CH_VV(vd, vs1, vs2) \ + ASM_WORD((0b101110 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +/* Update 4 W values - message scheduling. 
*/ +#define VSHA2MS_VV(vd, vs1, vs2) \ + ASM_WORD((0b101101 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +#define RND4(w0, w1, w2, w3, k) \ + /* Four rounds of compression. */ \ + VADD_VV(REG_V7, w0, k) \ + VMV_X_S(REG_T1, w1) \ + VSHA2CL_VV(REG_V5, REG_V7, REG_V4) \ + VMV_V_V(REG_V6, w2) \ + VSHA2CH_VV(REG_V4, REG_V7, REG_V5) \ + /* Update 4 W values - message schedule. */ \ + VMV_S_X(REG_V6, REG_T1) \ + VSHA2MS_VV(w0, w3, REG_V6) + +#define RND4_LAST(w, k) \ + /* Four rounds of compression. */ \ + VADD_VV(REG_V7, w, k) \ + VSHA2CL_VV(REG_V5, REG_V7, REG_V4) \ + VSHA2CH_VV(REG_V4, REG_V7, REG_V5) + +#define RND16(k) \ + RND4(REG_V0, REG_V1, REG_V2, REG_V3, (k + 0)) \ + RND4(REG_V1, REG_V2, REG_V3, REG_V0, (k + 1)) \ + RND4(REG_V2, REG_V3, REG_V0, REG_V1, (k + 2)) \ + RND4(REG_V3, REG_V0, REG_V1, REG_V2, (k + 3)) + +#define RND16_LAST(k) \ + RND4_LAST(REG_V0, (k + 0)) \ + RND4_LAST(REG_V1, (k + 1)) \ + RND4_LAST(REG_V2, (k + 2)) \ + RND4_LAST(REG_V3, (k + 3)) + +/* Transform the message data. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @param [in] blocks Number of blocks of data to hash. + */ +static void Sha256Transform(wc_Sha256* sha256, const byte* data, + word32 blocks) +{ + word32* k = (word32*)K; + + __asm__ __volatile__ ( + VSETIVLI(REG_ZERO, 4, 1, 1, 0b010, 0b000) + + /* Load: a|b|e|f, c|d|g|h + * 3 2 1 0 3 2 1 0 + */ + "mv t0, %[digest]\n\t" + VL2RE32_V(REG_V4, REG_T0) + + "mv t0, %[k]\n\t" + VL8RE32_V(REG_V8, REG_T0) + "addi t0, %[k], 128\n\t" + VL8RE32_V(REG_V16, REG_T0) + + "\n1:\n\t" + VMV_V_V(REG_V30, REG_V4) + VMV_V_V(REG_V31, REG_V5) + + /* Load 16 W into 4 vectors of 4 32-bit words. */ + "mv t0, %[data]\n\t" + VL4RE32_V(REG_V0, REG_T0) + VREV8(REG_V0, REG_V0) + VREV8(REG_V1, REG_V1) + VREV8(REG_V2, REG_V2) + VREV8(REG_V3, REG_V3) + + RND16(REG_V8) + RND16(REG_V12) + RND16(REG_V16) + RND16_LAST(REG_V20) + + VADD_VV(REG_V4, REG_V4, REG_V30) + VADD_VV(REG_V5, REG_V5, REG_V31) + + "addi %[blocks], %[blocks], -1\n\t" + "add %[data], %[data], 64\n\t" + "bnez %[blocks], 1b \n\t" + + "mv t0, %[digest]\n\t" + VS2R_V(REG_V4, REG_T0) + + : [blocks] "+r" (blocks), [data] "+r" (data), [k] "+r" (k) + : [digest] "r" (sha256->digest) + : "cc", "memory", "t0", "t1" + ); +} + +#endif /* WOLFSSL_RISCV_VECTOR_CRYPTO_ASM */ + +/* Update the hash with data. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. 
+ */ +static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, + word32 len) +{ + word32 add; + word32 blocks; + + /* only perform actions if a buffer is passed in */ + if (len > 0) { + AddLength(sha256, len); + + if (sha256->buffLen > 0) { + /* fill leftover buffer with data */ + add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add); + sha256->buffLen += add; + data += add; + len -= add; + if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) { + Sha256Transform(sha256, (byte*)sha256->buffer, 1); + sha256->buffLen = 0; + } + } + + /* number of blocks in a row to complete */ + blocks = len / WC_SHA256_BLOCK_SIZE; + + if (blocks > 0) { + Sha256Transform(sha256, data, blocks); + data += blocks * WC_SHA256_BLOCK_SIZE; + len -= blocks * WC_SHA256_BLOCK_SIZE; + } + + if (len > 0) { + /* copy over any remaining data leftover */ + XMEMCPY(sha256->buffer, data, len); + sha256->buffLen = len; + } + } + + /* account for possibility of not used if len = 0 */ + (void)add; + (void)blocks; + + return 0; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha256 SHA-256 object. + * @param [out] hash Buffer to hold hash result. + */ +static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash) +{ + byte* local; + + local = (byte*)sha256->buffer; + local[sha256->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha256->buffLen > WC_SHA256_PAD_SIZE) { + XMEMSET(&local[sha256->buffLen], 0, + WC_SHA256_BLOCK_SIZE - sha256->buffLen); + Sha256Transform(sha256, (byte*)sha256->buffer, 1); + sha256->buffLen = 0; + } + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen); + + /* put lengths in bits */ + sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + + (sha256->hiLen << 3); + sha256->loLen = sha256->loLen << 3; + + XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); + XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, + sizeof(word32)); + + /* store lengths */ + __asm__ __volatile__ ( + /* Reverse byte order of 32-bit words. */ +#if defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION) + "ld t1, 56(%[buff])\n\t" + REV8(REG_T1, REG_T1) + "srli t0, t1, 32\n\t" + "sw t0, 56(%[buff])\n\t" + "sw t1, 60(%[buff])\n\t" +#else + LOAD_WORD_REV(t0, 56, %[buff], t2, t3, t4) + LOAD_WORD_REV(t1, 60, %[buff], t2, t3, t4) + "sw t0, 56(%[buff])\n\t" + "sw t1, 60(%[buff])\n\t" +#endif + : + : [buff] "r" (sha256->buffer) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4" + ); + + Sha256Transform(sha256, (byte*)sha256->buffer, 1); + + __asm__ __volatile__ ( + /* Reverse byte order of 32-bit words. 
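+         * On the vector crypto path the working order f, e, b, a, h, g, d, c
+         * also has to be unpacked back to the standard a..h word order with
+         * the slide instructions below before the result is written out.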
*/ +#if defined(WOLFSSL_RISCV_VECTOR_CRYPTO_ASM) + VSETIVLI(REG_ZERO, 4, 1, 1, 0b010, 0b000) + "mv t0, %[digest]\n\t" + VL2RE32_V(REG_V8, REG_T0) + VREV8(REG_V8, REG_V8) + VREV8(REG_V9, REG_V9) + /* a|b|e|f, c|d|g|h + * 3 2 1 0 3 2 1 0 */ + VSLIDEDOWN_VI(REG_V0, REG_V8, 3) /* a */ + VSLIDEDOWN_VI(REG_V2, REG_V8, 2) /* b */ + VSLIDEDOWN_VI(REG_V1, REG_V8, 1) /* e */ + VSLIDEDOWN_VI(REG_V3, REG_V9, 3) /* c */ + VSLIDEDOWN_VI(REG_V4, REG_V9, 2) /* d */ + VSLIDEDOWN_VI(REG_V5, REG_V9, 1) /* g */ + /* -|-|-|a, -|-|-|e */ + VSLIDEUP_VI(REG_V0, REG_V2, 1) + /* -|-|b|a, -|-|-|e */ + VSLIDEUP_VI(REG_V0, REG_V3, 2) + /* -|c|b|a, -|-|-|e */ + VSLIDEUP_VI(REG_V0, REG_V4, 3) + /* d|c|b|a, -|-|-|e */ + VSLIDEUP_VI(REG_V1, REG_V8, 1) + /* d|c|b|a, -|-|f|e */ + VSLIDEUP_VI(REG_V1, REG_V5, 2) + /* d|c|b|a, -|g|f|e */ + VSLIDEUP_VI(REG_V1, REG_V9, 3) + /* d|c|b|a, h|g|f|e */ + "mv t0, %[hash]\n\t" + VS2R_V(REG_V0, REG_T0) +#elif defined(WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION) + VSETIVLI(REG_ZERO, 4, 1, 1, 0b010, 0b000) + "mv t0, %[digest]\n\t" + VL2RE32_V(REG_V0, REG_T0) + VREV8(REG_V0, REG_V0) + VREV8(REG_V1, REG_V1) + "mv t0, %[hash]\n\t" + VS2R_V(REG_V0, REG_T0) +#elif defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION) + "ld t1, 0(%[digest])\n\t" + "ld t3, 8(%[digest])\n\t" + "ld a5, 16(%[digest])\n\t" + "ld a7, 24(%[digest])\n\t" + REV8(REG_T1, REG_T1) + REV8(REG_T3, REG_T3) + REV8(REG_A5, REG_A5) + REV8(REG_A7, REG_A7) + "srli t0, t1, 32\n\t" + "srli t2, t3, 32\n\t" + "srli a4, a5, 32\n\t" + "srli a6, a7, 32\n\t" + "sw t0, 0(%[hash])\n\t" + "sw t1, 4(%[hash])\n\t" + "sw t2, 8(%[hash])\n\t" + "sw t3, 12(%[hash])\n\t" + "sw a4, 16(%[hash])\n\t" + "sw a5, 20(%[hash])\n\t" + "sw a6, 24(%[hash])\n\t" + "sw a7, 28(%[hash])\n\t" +#else + LOAD_WORD_REV(t0, 0, %[digest], t2, t3, t4) + LOAD_WORD_REV(t1, 4, %[digest], t2, t3, t4) + LOAD_WORD_REV(a4, 8, %[digest], t2, t3, t4) + LOAD_WORD_REV(a5, 12, %[digest], t2, t3, t4) + "sw t0, 0(%[hash])\n\t" + "sw t1, 4(%[hash])\n\t" + "sw a4, 8(%[hash])\n\t" + "sw a5, 12(%[hash])\n\t" + LOAD_WORD_REV(t0, 16, %[digest], t2, t3, t4) + LOAD_WORD_REV(t1, 20, %[digest], t2, t3, t4) + LOAD_WORD_REV(a4, 24, %[digest], t2, t3, t4) + LOAD_WORD_REV(a5, 28, %[digest], t2, t3, t4) + "sw t0, 16(%[hash])\n\t" + "sw t1, 20(%[hash])\n\t" + "sw a4, 24(%[hash])\n\t" + "sw a5, 28(%[hash])\n\t" +#endif + : + : [digest] "r" (sha256->digest), [hash] "r" (hash) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7" + ); +} + + +#ifndef NO_SHA256 + +/* Initialize SHA-256 object for hashing. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] heap Dynamic memory hint. + * @param [in] devId Device Id. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 is NULL. + */ +int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) +{ + int ret = 0; + + /* Validate parameters. */ + if (sha256 == NULL) { + ret = BAD_FUNC_ARG; + } + else { + sha256->heap = heap; + #ifdef WOLF_CRYPTO_CB + sha256->devId = devId; + #endif + (void)devId; + + InitSha256(sha256); + } + + return ret; +} + +/* Initialize SHA-256 object for hashing. + * + * @param [in, out] sha256 SHA-256 object. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 is NULL. + */ +int wc_InitSha256(wc_Sha256* sha256) +{ + return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID); +} + +/* Free the SHA-256 hash. + * + * @param [in] sha256 SHA-256 object. + */ +void wc_Sha256Free(wc_Sha256* sha256) +{ + /* No dynamic memory allocated. */ + (void)sha256; +} + +/* Update the hash with data. 
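+ * Unprocessed bytes are buffered internally until a full 64-byte block is
+ * available to transform.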
+ * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 is NULL. + * @return BAD_FUNC_ARG when data is NULL but len is not 0. + */ +int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + int ret; + + /* Validate parameters. */ + if ((sha256 == NULL) || ((data == NULL) && (len != 0))) { + ret = BAD_FUNC_ARG; + } + else { + ret = Sha256Update(sha256, data, len); + } + + return ret; +} + +/* Put the current hash into buffer. + * + * @param [in, out] sha256 SHA-256 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 or hash is NULL. + */ +int wc_Sha256FinalRaw(wc_Sha256* sha256, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha256 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + #ifdef LITTLE_ENDIAN_ORDER + word32 digest[WC_SHA256_DIGEST_SIZE / sizeof(word32)]; + + ByteReverseWords((word32*)digest, (word32*)sha256->digest, + WC_SHA256_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE); + #else + XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + #endif + } + + return ret; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha256 SHA-256 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 or hash is NULL. + */ +int wc_Sha256Final(wc_Sha256* sha256, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha256 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Finalize hash. */ + Sha256Final(sha256, hash); + /* Restart SHA-256 object for next hash. */ + InitSha256(sha256); + } + + return ret; +} + +/* Finalize the hash and put into buffer but don't modify state. + * + * @param [in, out] sha256 SHA-256 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 or hash is NULL. + */ +int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash) +{ + int ret; + + /* Validate parameters. */ + if ((sha256 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha256 tmpSha256; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha256Copy(sha256, &tmpSha256); + if (ret == 0) { + /* Finalize copy. */ + Sha256Final(&tmpSha256, hash); + } + } + + return ret; +} + +#ifdef WOLFSSL_HASH_FLAGS +/* Set flags of SHA-256 object. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] flags Flags to set. + * @return 0 on success. + */ +int wc_Sha256SetFlags(wc_Sha256* sha256, word32 flags) +{ + /* Check we have an object to use. */ + if (sha256 != NULL) { + sha256->flags = flags; + } + return 0; +} +/* Get flags of SHA-256 object. + * + * @param [in] sha256 SHA-256 object. + * @param [out] flags Flags from SHA-256 object. + * @return 0 on success. + */ +int wc_Sha256GetFlags(wc_Sha256* sha256, word32* flags) +{ + /* Check we have an object and return parameter to use. */ + if ((sha256 != NULL) && (flags != NULL)) { + *flags = sha256->flags; + } + return 0; +} +#endif + +/* Deep copy the SHA-256 object. + * + * @param [in] src SHA-256 object to copy. + * @param [out] dst SHA-256 object to fill. + * @return 0 on success. + * @return BAD_FUNC_ARG when src or dst is NULL. + */ +int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst) +{ + int ret = 0; + + /* Validate parameters. 
*/ + if ((src == NULL) || (dst == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + XMEMCPY(dst, src, sizeof(wc_Sha256)); + } + + return ret; +} + +#ifdef OPENSSL_EXTRA +/* Update the hash with one block of data. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 or data is NULL. + */ +int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha256 == NULL) || (data == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + #ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords(sha256->buffer, (word32*)data, WC_SHA256_BLOCK_SIZE); + #else + XMEMCPY(sha256->buffer, data, WC_SHA256_BLOCK_SIZE); + #endif + Sha256Transform(sha256, (byte*)sha256->buffer, 1); + } + + return ret; +} +#endif + +#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH) +/* Update the hash with one block of data and optionally get hash. + * + * @param [in, out] sha256 SHA-256 object. + * @param [in] data Buffer of data to hash. + * @param [out] hash Buffer to hold hash. May be NULL. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha256 or data is NULL. + */ +int wc_Sha256HashBlock(wc_Sha256* sha256, const unsigned char* data, + unsigned char* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha256 == NULL) || (data == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Hash block. */ + Sha256Transform(sha256, data, 1); + + if (hash != NULL) { + /* Reverse bytes in digest. */ + #ifdef LITTLE_ENDIAN_ORDER + word32* hash32 = (word32*)hash; + word32* digest = (word32*)sha256->digest; + hash32[0] = ByteReverseWord32(digest[0]); + hash32[1] = ByteReverseWord32(digest[1]); + hash32[2] = ByteReverseWord32(digest[2]); + hash32[3] = ByteReverseWord32(digest[3]); + hash32[4] = ByteReverseWord32(digest[4]); + hash32[5] = ByteReverseWord32(digest[5]); + hash32[6] = ByteReverseWord32(digest[6]); + hash32[7] = ByteReverseWord32(digest[7]); + #else + XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + #endif + /* Reset state. */ + #ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha256->digest[0] = 0x6A09E667L; + sha256->digest[1] = 0xBB67AE85L; + sha256->digest[2] = 0x3C6EF372L; + sha256->digest[3] = 0xA54FF53AL; + sha256->digest[4] = 0x510E527FL; + sha256->digest[5] = 0x9B05688CL; + sha256->digest[6] = 0x1F83D9ABL; + sha256->digest[7] = 0x5BE0CD19L; + #else + /* f, e, b, a, h, g, d, c */ + sha256->digest[0] = 0x9B05688CL; + sha256->digest[1] = 0x510E527FL; + sha256->digest[2] = 0xBB67AE85L; + sha256->digest[3] = 0x6A09E667L; + sha256->digest[4] = 0x5BE0CD19L; + sha256->digest[5] = 0x1F83D9ABL; + sha256->digest[6] = 0xA54FF53AL; + sha256->digest[7] = 0x3C6EF372L; + #endif + } + } + + return ret; +} +#endif /* WOLFSSL_HAVE_LMS && !WOLFSSL_LMS_FULL_HASH */ + +#endif /* !NO_SHA256 */ + + +#ifdef WOLFSSL_SHA224 + +/* Initialize SHA-224 object for hashing. + * + * @param [in, out] sha224 SHA-224 object. + */ +static void InitSha224(wc_Sha224* sha224) +{ + /* Set initial hash values. 
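+     * SHA-224 initial hash words from FIPS 180-4; the vector crypto build
+     * uses the same f, e, b, a, h, g, d, c ordering as SHA-256.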
*/ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha224->digest[0] = 0xc1059ed8; + sha224->digest[1] = 0x367cd507; + sha224->digest[2] = 0x3070dd17; + sha224->digest[3] = 0xf70e5939; + sha224->digest[4] = 0xffc00b31; + sha224->digest[5] = 0x68581511; + sha224->digest[6] = 0x64f98fa7; + sha224->digest[7] = 0xbefa4fa4; +#else + /* f, e, b, a, h, g, d, c */ + sha224->digest[0] = 0x68581511; + sha224->digest[1] = 0xffc00b31; + sha224->digest[2] = 0x367cd507; + sha224->digest[3] = 0xc1059ed8; + sha224->digest[4] = 0xbefa4fa4; + sha224->digest[5] = 0x64f98fa7; + sha224->digest[6] = 0xf70e5939; + sha224->digest[7] = 0x3070dd17; +#endif + + /* No hashed data. */ + sha224->buffLen = 0; + /* No data hashed. */ + sha224->loLen = 0; + sha224->hiLen = 0; + +#ifdef WOLFSSL_HASH_FLAGS + sha224->flags = 0; +#endif +} + +/* Initialize SHA-224 object for hashing. + * + * @param [in, out] sha224 SHA-224 object. + * @param [in] heap Dynamic memory hint. + * @param [in] devId Device Id. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha224 is NULL. + */ +int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId) +{ + int ret = 0; + + /* Validate parameters. */ + if (sha224 == NULL) { + ret = BAD_FUNC_ARG; + } + else { + sha224->heap = heap; + (void)devId; + + InitSha224(sha224); + } + + return ret; +} + +/* Initialize SHA-224 object for hashing. + * + * @param [in, out] sha224 SHA-224 object. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha224 is NULL. + */ +int wc_InitSha224(wc_Sha224* sha224) +{ + return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID); +} + +/* Update the hash with data. + * + * @param [in, out] sha224 SHA-224 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha224 is NULL. + * @return BAD_FUNC_ARG when data is NULL but len is not 0. + */ +int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len) +{ + int ret; + + /* Validate parameters. */ + if ((sha224 == NULL) || ((data == NULL) && (len > 0))) { + ret = BAD_FUNC_ARG; + } + else { + ret = Sha256Update((wc_Sha256 *)sha224, data, len); + } + + return ret; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha224 SHA-224 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha224 or hash is NULL. + */ +int wc_Sha224Final(wc_Sha224* sha224, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha224 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + word32 hashTmp[WC_SHA256_DIGEST_SIZE/sizeof(word32)]; + /* Finalize hash. */ + Sha256Final((wc_Sha256*)sha224, (byte*)hashTmp); + /* Return only 224 bits. */ + XMEMCPY(hash, hashTmp, WC_SHA224_DIGEST_SIZE); + /* Restart SHA-256 object for next hash. */ + InitSha224(sha224); + } + + return ret; +} + +/* Free the SHA-224 hash. + * + * @param [in] sha224 SHA-224 object. + */ +void wc_Sha224Free(wc_Sha224* sha224) +{ + /* No dynamic memory allocated. */ + (void)sha224; +} + +/* Finalize the hash and put into buffer but don't modify state. + * + * @param [in, out] sha224 SHA-224 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha224 or hash is NULL. + */ +int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash) +{ + int ret; + + /* Validate parameters. 
*/ + if ((sha224 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha224 tmpSha224; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha224Copy(sha224, &tmpSha224); + if (ret == 0) { + /* Finalize copy. */ + ret = wc_Sha224Final(&tmpSha224, hash); + } + } + + return ret; +} + +#ifdef WOLFSSL_HASH_FLAGS +/* Set flags of SHA-224 object. + * + * @param [in, out] sha224 SHA-224 object. + * @param [in] flags Flags to set. + * @return 0 on success. + */ +int wc_Sha224SetFlags(wc_Sha224* sha224, word32 flags) +{ + /* Check we have an object to use. */ + if (sha224 != NULL) { + sha224->flags = flags; + } + return 0; +} +/* Get flags of SHA-224 object. + * + * @param [in] sha224 SHA-224 object. + * @param [out] flags Flags from SHA-224 object. + * @return 0 on success. + */ +int wc_Sha224GetFlags(wc_Sha224* sha224, word32* flags) +{ + /* Check we have an object and return parameter to use. */ + if ((sha224 != NULL) && (flags != NULL)) { + *flags = sha224->flags; + } + return 0; +} +#endif + +/* Deep copy the SHA-224 object. + * + * @param [in] src SHA-224 object to copy. + * @param [out] dst SHA-224 object to fill. + * @return 0 on success. + * @return BAD_FUNC_ARG when src or dst is NULL. + */ +int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst) +{ + int ret = 0; + + /* Validate parameters. */ + if ((src == NULL) || (dst == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + XMEMCPY(dst, src, sizeof(wc_Sha224)); + } + + return ret; +} + +#endif /* WOLFSSL_SHA224 */ + +#endif /* !NO_SHA256 || WOLFSSL_SHA224 */ +#endif /* WOLFSSL_RISCV_ASM */ diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha3.c b/wolfcrypt/src/port/riscv/riscv-64-sha3.c new file mode 100644 index 000000000..e6e73dd55 --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-sha3.c @@ -0,0 +1,863 @@ +/* riscv-64-sha3.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include + +#ifdef WOLFSSL_RISCV_ASM +#if defined(WOLFSSL_SHA3) && !defined(WOLFSSL_XILINX_CRYPT) && \ + !defined(WOLFSSL_AFALG_XILINX_SHA3) + +#if FIPS_VERSION3_GE(2,0,0) + /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ + #define FIPS_NO_WRAPPERS + + #ifdef USE_WINDOWS_API + #pragma code_seg(".fipsA$n") + #pragma const_seg(".fipsB$n") + #endif +#endif + +#include + +static const word64 hash_keccak_r[24] = +{ + 0x0000000000000001UL, 0x0000000000008082UL, + 0x800000000000808aUL, 0x8000000080008000UL, + 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, + 0x000000000000008aUL, 0x0000000000000088UL, + 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, + 0x8000000000008089UL, 0x8000000000008003UL, + 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, + 0x8000000080008081UL, 0x8000000000008080UL, + 0x0000000080000001UL, 0x8000000080008008UL +}; + +#ifndef WOLFSSL_RISCV_VECTOR + +#define S0_0 "a1" +#define S0_1 "a2" +#define S0_2 "a3" +#define S0_3 "a4" +#define S0_4 "a5" +#define S1_0 "s1" +#define S1_1 "s2" +#define S1_2 "s3" +#define S1_3 "s4" +#define S1_4 "s5" +#define S2_0 "s6" +#define S2_1 "s7" +#define S2_2 "s8" +#define S2_3 "s9" +#define S2_4 "s10" +#define S3_0 "t0" +#define S3_1 "t1" +#define S3_2 "t2" +#define S3_3 "t3" +#define S3_4 "t4" + +#define T_0 "a6" +#define T_1 "a7" +#define T_2 "t5" +#define T_3 "t6" +#define T_4 "s11" + +#define SR0_0 REG_A1 +#define SR0_1 REG_A2 +#define SR0_2 REG_A3 +#define SR0_3 REG_A4 +#define SR0_4 REG_A5 +#define SR1_0 REG_S1 +#define SR1_1 REG_S2 +#define SR1_2 REG_S3 +#define SR1_3 REG_S4 +#define SR1_4 REG_S5 +#define SR2_0 REG_S6 +#define SR2_1 REG_S7 +#define SR2_2 REG_S8 +#define SR2_3 REG_S9 +#define SR2_4 REG_S10 +#define SR3_0 REG_T0 +#define SR3_1 REG_T1 +#define SR3_2 REG_T2 +#define SR3_3 REG_T3 +#define SR3_4 REG_T4 + +#define TR_0 REG_A6 +#define TR_1 REG_A7 +#define TR_2 REG_T5 +#define TR_3 REG_T6 +#define TR_4 REG_S11 + +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + +#define SWAP_ROTL(t0, tr0, t1, s, sr, rr, rl) \ + "mv " t1 ", " s "\n\t" \ + "srli " s ", " t0 ", " #rr "\n\t" \ + "slli " t0 ", " t0 ", " #rl "\n\t" \ + "or " s ", " s ", " t0 "\n\t" + +#define SWAP_ROTL_MEM(t0, tr0, t1, t2, s, rr, rl) \ + "ld " t1 ", " #s "(%[s])\n\t" \ + "srli " t2 ", " t0 ", " #rr "\n\t" \ + "slli " t0 ", " t0 ", " #rl "\n\t" \ + "or " t0 ", " t0 ", " t2 "\n\t" \ + "sd " t0 ", " #s "(%[s])\n\t" + +#else + +#define SWAP_ROTL(t0, tr0, t1, s, sr, rr, rl) \ + "mv " t1 ", " s "\n\t" \ + RORI(sr, tr0, rr) + +#define SWAP_ROTL_MEM(t0, tr0, t1, t2, s, rr, rl) \ + "ld " t1 ", " #s "(%[s])\n\t" \ + RORI(tr0, tr0, rr) \ + "sd " t0 ", " #s "(%[s])\n\t" + +#endif + +void BlockSha3(word64* s) +{ + const word64* r = hash_keccak_r; + + __asm__ __volatile__ ( + "addi sp, sp, -24\n\t" + "li " T_4 ", 24\n\t" + "ld " S0_0 ", 0(%[s])\n\t" + "ld " S0_1 ", 8(%[s])\n\t" + "ld " S0_2 ", 16(%[s])\n\t" + "ld " S0_3 ", 24(%[s])\n\t" + "ld " S0_4 ", 32(%[s])\n\t" + "ld " S1_0 ", 40(%[s])\n\t" + "ld " S1_1 ", 48(%[s])\n\t" + "ld " S1_2 ", 56(%[s])\n\t" + "ld " S1_3 ", 64(%[s])\n\t" + "ld " S1_4 ", 72(%[s])\n\t" + "ld " S2_0 ", 80(%[s])\n\t" + "ld " S2_1 ", 
88(%[s])\n\t" + "ld " S2_2 ", 96(%[s])\n\t" + "ld " S2_3 ", 104(%[s])\n\t" + "ld " S2_4 ", 112(%[s])\n\t" + "ld " S3_0 ", 120(%[s])\n\t" + "ld " S3_1 ", 128(%[s])\n\t" + "ld " S3_2 ", 136(%[s])\n\t" + "ld " S3_3 ", 144(%[s])\n\t" + "ld " S3_4 ", 152(%[s])\n\t" + "ld " T_0 ", 160(%[s])\n\t" + "ld " T_1 ", 168(%[s])\n\t" + "ld " T_2 ", 176(%[s])\n\t" + "\n" + "L_riscv_64_block_sha3_loop:\n\t" + "sd " T_4 ", 16(sp)\n\t" + + /* COLUMN MIX */ + /* Calc b[0], b[1], b[2], b[3], b[4] */ + "ld " T_3 ", 184(%[s])\n\t" + "ld " T_4 ", 192(%[s])\n\t" + "xor " T_0 ", " T_0 ", " S0_0 "\n\t" + "xor " T_1 ", " T_1 ", " S0_1 "\n\t" + "xor " T_2 ", " T_2 ", " S0_2 "\n\t" + "xor " T_3 ", " T_3 ", " S0_3 "\n\t" + "xor " T_4 ", " T_4 ", " S0_4 "\n\t" + "xor " T_0 ", " T_0 ", " S1_0 "\n\t" + "xor " T_1 ", " T_1 ", " S1_1 "\n\t" + "xor " T_2 ", " T_2 ", " S1_2 "\n\t" + "xor " T_3 ", " T_3 ", " S1_3 "\n\t" + "xor " T_4 ", " T_4 ", " S1_4 "\n\t" + "xor " T_0 ", " T_0 ", " S2_0 "\n\t" + "xor " T_1 ", " T_1 ", " S2_1 "\n\t" + "xor " T_2 ", " T_2 ", " S2_2 "\n\t" + "xor " T_3 ", " T_3 ", " S2_3 "\n\t" + "xor " T_4 ", " T_4 ", " S2_4 "\n\t" + "xor " T_0 ", " T_0 ", " S3_0 "\n\t" + "xor " T_1 ", " T_1 ", " S3_1 "\n\t" + "xor " T_2 ", " T_2 ", " S3_2 "\n\t" + "xor " T_3 ", " T_3 ", " S3_3 "\n\t" + "xor " T_4 ", " T_4 ", " S3_4 "\n\t" + "sd " T_1 ", 0(sp)\n\t" + "sd " T_3 ", 8(sp)\n\t" + /* T_0, T_1, T_2, T_3, T_4 */ + + /* s[0],s[5],s[10],s[15],s[20] ^= b[4] ^ ROTL(b[1], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_1 ", 63\n\t" + "slli " T_1 ", " T_1 ", 1\n\t" + "or " T_1 ", " T_1 ", " T_3 "\n\t" +#else + RORI(TR_1, TR_1, 63) +#endif + "ld " T_3 ", 160(%[s])\n\t" + "xor " T_1 ", " T_1 ", " T_4 "\n\t" + "xor " S0_0 ", " S0_0 ", " T_1 "\n\t" + "xor " S1_0 ", " S1_0 ", " T_1 "\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S2_0 ", " S2_0 ", " T_1 "\n\t" + "xor " S3_0 ", " S3_0 ", " T_1 "\n\t" + "sd " T_3 ", 160(%[s])\n\t" + /* T_0, T_2, T_4 */ + + /* s[1],s[6],s[11],s[16],s[21] ^= b[0] ^ ROTL(b[2], 1)*/ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_2 ", 63\n\t" + "slli " T_1 ", " T_2 ", 1\n\t" + "or " T_1 ", " T_1 ", " T_3 "\n\t" +#else + RORI(TR_1, TR_2, 63) +#endif + "ld " T_3 ", 168(%[s])\n\t" + "xor " T_1 ", " T_1 ", " T_0 "\n\t" + "xor " S0_1 ", " S0_1 ", " T_1 "\n\t" + "xor " S1_1 ", " S1_1 ", " T_1 "\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S2_1 ", " S2_1 ", " T_1 "\n\t" + "xor " S3_1 ", " S3_1 ", " T_1 "\n\t" + "sd " T_3 ", 168(%[s])\n\t" + /* T_0, T_2, T_4 */ + + /* s[3],s[8],s[13],s[18],s[23] ^= b[2] ^ ROTL(b[4], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_3 ", " T_4 ", 63\n\t" + "slli " T_4 ", " T_4 ", 1\n\t" + "or " T_4 ", " T_4 ", " T_3 "\n\t" +#else + RORI(TR_4, TR_4, 63) +#endif + "ld " T_3 ", 184(%[s])\n\t" + "xor " T_4 ", " T_4 ", " T_2 "\n\t" + "xor " S0_3 ", " S0_3 ", " T_4 "\n\t" + "xor " S1_3 ", " S1_3 ", " T_4 "\n\t" + "xor " T_3 ", " T_3 ", " T_4 "\n\t" + "xor " S2_3 ", " S2_3 ", " T_4 "\n\t" + "xor " S3_3 ", " S3_3 ", " T_4 "\n\t" + "sd " T_3 ", 184(%[s])\n\t" + /* T_0, T_2 */ + + "ld " T_3 ", 8(sp)\n\t" + /* s[4],s[9],s[14],s[19],s[24] ^= b[3] ^ ROTL(b[0], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_2 ", " T_0 ", 63\n\t" + "slli " T_0 ", " T_0 ", 1\n\t" + "or " T_0 ", " T_0 ", " T_2 "\n\t" +#else + RORI(TR_0, TR_0, 63) +#endif + "ld " T_4 ", 192(%[s])\n\t" + "xor " T_0 ", " T_0 ", " T_3 "\n\t" + "xor " S0_4 ", " S0_4 ", " T_0 "\n\t" + "xor " S1_4 ", " S1_4 ", " T_0 "\n\t" + "xor " T_4 ", " T_4 ", " 
T_0 "\n\t" + "xor " S2_4 ", " S2_4 ", " T_0 "\n\t" + "xor " S3_4 ", " S3_4 ", " T_0 "\n\t" + /* T_3 */ + + "ld " T_1 ", 0(sp)\n\t" + /* s[2],s[7],s[12],s[17],s[22] ^= b[1] ^ ROTL(b[3], 1) */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_2 ", " T_3 ", 63\n\t" + "slli " T_3 ", " T_3 ", 1\n\t" + "or " T_3 ", " T_3 ", " T_2 "\n\t" +#else + RORI(TR_3, TR_3, 63) +#endif + "ld " T_2 ", 176(%[s])\n\t" + "xor " T_3 ", " T_3 ", " T_1 "\n\t" + "xor " S0_2 ", " S0_2 ", " T_3 "\n\t" + "xor " S1_2 ", " S1_2 ", " T_3 "\n\t" + "xor " T_2 ", " T_2 ", " T_3 "\n\t" + "xor " S2_2 ", " S2_2 ", " T_3 "\n\t" + "xor " S3_2 ", " S3_2 ", " T_3 "\n\t" + + /* SWAP ROTL */ + /* t0 = s[10], s[10] = s[1] >>> 63 */ + "mv " T_0 ", " S2_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " T_1 ", " S0_1 ", 63\n\t" + "slli " S2_0 ", " S0_1 ", 1\n\t" + "or " S2_0 ", " S2_0 ", " T_1 "\n\t" +#else + RORI(SR2_0, SR0_1, 63) +#endif + /* t1 = s[ 7], s[ 7] = t0 >>> 61 */ + SWAP_ROTL(T_0, TR_0, T_1, S1_2, SR1_2, 61, 3) + /* t0 = s[11], s[11] = t1 >>> 58 */ + SWAP_ROTL(T_1, TR_1, T_0, S2_1, SR2_1, 58, 6) + /* t1 = s[17], s[17] = t0 >>> 54 */ + SWAP_ROTL(T_0, TR_0, T_1, S3_2, SR3_2, 54, 10) + /* t0 = s[18], s[18] = t1 >>> 49 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_3, SR3_3, 49, 15) + /* t1 = s[ 3], s[ 3] = t0 >>> 43 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_3, SR0_3, 43, 21) + /* t0 = s[ 5], s[ 5] = t1 >>> 36 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_0, SR1_0, 36, 28) + /* t1 = s[16], s[16] = t0 >>> 28 */ + SWAP_ROTL(T_0, TR_0, T_1, S3_1, SR3_1, 28, 36) + /* t0 = s[ 8], s[ 8] = t1 >>> 19 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_3, SR1_3, 19, 45) + /* t1 = s[21], s[21] = t0 >>> 9 */ + SWAP_ROTL_MEM(T_0, TR_0, T_1, T_3, 168, 9, 55) + /* t0 = s[24], s[24] = t1 >>> 62 */ + SWAP_ROTL(T_1, TR_1, T_0, T_4, TR_4, 62, 2) + /* t1 = s[ 4], s[ 4] = t0 >>> 50 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_4, SR0_4, 50, 14) + /* t0 = s[15], s[15] = t1 >>> 37 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_0, SR3_0, 37, 27) + /* t1 = s[23], s[23] = t0 >>> 23 */ + SWAP_ROTL_MEM(T_0, TR_0, T_1, T_3, 184, 23, 41) + /* t0 = s[19], s[19] = t1 >>> 8 */ + SWAP_ROTL(T_1, TR_1, T_0, S3_4, SR3_4, 8, 56) + /* t1 = s[13], s[13] = t0 >>> 56 */ + SWAP_ROTL(T_0, TR_0, T_1, S2_3, SR2_3, 56, 8) + /* t0 = s[12], s[12] = t1 >>> 39 */ + SWAP_ROTL(T_1, TR_1, T_0, S2_2, SR2_2, 39, 25) + /* t1 = s[ 2], s[ 2] = t0 >>> 21 */ + SWAP_ROTL(T_0, TR_0, T_1, S0_2, SR0_2, 21, 43) + /* t0 = s[20], s[20] = t1 >>> 2 */ + SWAP_ROTL_MEM(T_1, TR_1, T_0, T_3, 160, 2, 62) + /* t1 = s[14], s[14] = t0 >>> 46 */ + SWAP_ROTL(T_0, TR_0, T_1, S2_4, SR2_4, 46, 18) + /* t0 = s[22], s[22] = t1 >>> 25 */ + SWAP_ROTL(T_1, TR_1, T_0, T_2, TR_2, 25, 39) + /* t1 = s[ 9], s[ 9] = t0 >>> 3 */ + SWAP_ROTL(T_0, TR_0, T_1, S1_4, SR1_4, 3, 61) + /* t0 = s[ 6], s[ 6] = t1 >>> 44 */ + SWAP_ROTL(T_1, TR_1, T_0, S1_1, SR1_1, 44, 20) + /* s[ 1] = t0 >>> 20 */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "srli " S0_1 ", " T_0 ", 20\n\t" + "slli " T_0 ", " T_0 ", 44\n\t" + "or " S0_1 ", " S0_1 ", " T_0 "\n\t" +#else + RORI(SR0_1, TR_0, 20) +#endif + + /* ROW MIX */ + /* s[0] */ + "mv " T_0 ", " S0_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_1 "\n\t" + "and " T_3 ", " T_3 ", " S0_2 "\n\t" +#else + ANDN(TR_3, SR0_2, SR0_1) +#endif + "xor " S0_0 ", " S0_0 ", " T_3 "\n\t" + /* s[1] */ + "mv " T_1 ", " S0_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_2 "\n\t" + "and " T_3 ", " T_3 ", " S0_3 "\n\t" +#else + ANDN(TR_3, SR0_3, SR0_2) +#endif + "xor " S0_1 ", " S0_1 ", " T_3 "\n\t" + /* s[2] */ 
+#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_3 "\n\t" + "and " T_3 ", " T_3 ", " S0_4 "\n\t" +#else + ANDN(TR_3, SR0_4, SR0_3) +#endif + "xor " S0_2 ", " S0_2 ", " T_3 "\n\t" + /* s[3] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S0_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR0_4) +#endif + "xor " S0_3 ", " S0_3 ", " T_3 "\n\t" + /* s[4] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S0_4 ", " S0_4 ", " T_3 "\n\t" + + /* s[5] */ + "mv " T_0 ", " S1_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_1 "\n\t" + "and " T_3 ", " T_3 ", " S1_2 "\n\t" +#else + ANDN(TR_3, SR1_2, SR1_1) +#endif + "xor " S1_0 ", " S1_0 ", " T_3 "\n\t" + /* s[6] */ + "mv " T_1 ", " S1_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_2 "\n\t" + "and " T_3 ", " T_3 ", " S1_3 "\n\t" +#else + ANDN(TR_3, SR1_3, SR1_2) +#endif + "xor " S1_1 ", " S1_1 ", " T_3 "\n\t" + /* s[7] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_3 "\n\t" + "and " T_3 ", " T_3 ", " S1_4 "\n\t" +#else + ANDN(TR_3, SR1_4, SR1_3) +#endif + "xor " S1_2 ", " S1_2 ", " T_3 "\n\t" + /* s[8] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S1_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR1_4) +#endif + "xor " S1_3 ", " S1_3 ", " T_3 "\n\t" + /* s[9] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S1_4 ", " S1_4 ", " T_3 "\n\t" + + /* s[10] */ + "mv " T_0 ", " S2_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_1 "\n\t" + "and " T_3 ", " T_3 ", " S2_2 "\n\t" +#else + ANDN(TR_3, SR2_2, SR2_1) +#endif + "xor " S2_0 ", " S2_0 ", " T_3 "\n\t" + /* s[11] */ + "mv " T_1 ", " S2_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_2 "\n\t" + "and " T_3 ", " T_3 ", " S2_3 "\n\t" +#else + ANDN(TR_3, SR2_3, SR2_2) +#endif + "xor " S2_1 ", " S2_1 ", " T_3 "\n\t" + /* s[12] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_3 "\n\t" + "and " T_3 ", " T_3 ", " S2_4 "\n\t" +#else + ANDN(TR_3, SR2_4, SR2_3) +#endif + "xor " S2_2 ", " S2_2 ", " T_3 "\n\t" + /* s[13] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S2_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR2_4) +#endif + "xor " S2_3 ", " S2_3 ", " T_3 "\n\t" + /* s[14] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S2_4 ", " S2_4 ", " T_3 "\n\t" + + /* s[15] */ + "mv " T_0 ", " S3_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_1 "\n\t" + "and " T_3 ", " T_3 ", " S3_2 "\n\t" +#else + ANDN(TR_3, SR3_2, SR3_1) +#endif + "xor " S3_0 ", " S3_0 ", " T_3 "\n\t" + /* s[16] */ + "mv " T_1 ", " S3_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_2 "\n\t" + "and " T_3 ", " T_3 ", " S3_3 "\n\t" +#else + ANDN(TR_3, SR3_3, SR3_2) +#endif + "xor " S3_1 ", " S3_1 ", " T_3 "\n\t" + /* s[17] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_3 "\n\t" + "and " T_3 ", " T_3 ", " S3_4 "\n\t" +#else + ANDN(TR_3, SR3_4, SR3_3) +#endif + "xor " S3_2 ", " S3_2 ", " T_3 "\n\t" + /* s[18] */ +#ifndef 
WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " S3_4 "\n\t" + "and " T_3 ", " T_3 ", " T_0 "\n\t" +#else + ANDN(TR_3, TR_0, SR3_4) +#endif + "xor " S3_3 ", " S3_3 ", " T_3 "\n\t" + /* s[19] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " T_3 ", " T_0 "\n\t" + "and " T_3 ", " T_3 ", " T_1 "\n\t" +#else + ANDN(TR_3, TR_1, TR_0) +#endif + "xor " S3_4 ", " S3_4 ", " T_3 "\n\t" + + "sd " S3_0 ", 120(%[s])\n\t" + "sd " S3_1 ", 128(%[s])\n\t" + "sd " S3_2 ", 136(%[s])\n\t" + "ld " T_0 ", 160(%[s])\n\t" + "ld " T_1 ", 168(%[s])\n\t" + "ld " T_3 ", 184(%[s])\n\t" + + /* s[20] */ + "mv " S3_0 ", " T_0 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_1 "\n\t" + "and " S3_2 ", " S3_2 ", " T_2 "\n\t" +#else + ANDN(SR3_2, TR_2, TR_1) +#endif + "xor " T_0 ", " T_0 ", " S3_2 "\n\t" + /* s[21] */ + "mv " S3_1 ", " T_1 "\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_2 "\n\t" + "and " S3_2 ", " S3_2 ", " T_3 "\n\t" +#else + ANDN(SR3_2, TR_3, TR_2) +#endif + "xor " T_1 ", " T_1 ", " S3_2 "\n\t" + /* s[22] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_3 "\n\t" + "and " S3_2 ", " S3_2 ", " T_4 "\n\t" +#else + ANDN(SR3_2, TR_4, TR_3) +#endif + "xor " T_2 ", " T_2 ", " S3_2 "\n\t" + /* s[23] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " T_4 "\n\t" + "and " S3_2 ", " S3_2 ", " S3_0 "\n\t" +#else + ANDN(SR3_2, SR3_0, TR_4) +#endif + "xor " T_3 ", " T_3 ", " S3_2 "\n\t" + /* s[24] */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + "not " S3_2 ", " S3_0 "\n\t" + "and " S3_2 ", " S3_2 ", " S3_1 "\n\t" +#else + ANDN(SR3_2, SR3_1, SR3_0) +#endif + "xor " T_4 ", " T_4 ", " S3_2 "\n\t" + + "ld " S3_0 ", 120(%[s])\n\t" + "ld " S3_1 ", 128(%[s])\n\t" + "ld " S3_2 ", 136(%[s])\n\t" + "sd " T_0 ", 160(%[s])\n\t" + "sd " T_1 ", 168(%[s])\n\t" + "sd " T_2 ", 176(%[s])\n\t" + "sd " T_3 ", 184(%[s])\n\t" + "sd " T_4 ", 192(%[s])\n\t" + + "ld " T_4 ", 16(sp)\n\t" + "ld " T_3 ", 0(%[r])\n\t" + "addi %[r], %[r], 8\n\t" + "addi " T_4 ", " T_4 ", -1\n\t" + "xor " S0_0 ", " S0_0 ", " T_3 "\n\t" + "bnez " T_4 ", L_riscv_64_block_sha3_loop\n\t" + + "sd " S0_0 ", 0(%[s])\n\t" + "sd " S0_1 ", 8(%[s])\n\t" + "sd " S0_2 ", 16(%[s])\n\t" + "sd " S0_3 ", 24(%[s])\n\t" + "sd " S0_4 ", 32(%[s])\n\t" + "sd " S1_0 ", 40(%[s])\n\t" + "sd " S1_1 ", 48(%[s])\n\t" + "sd " S1_2 ", 56(%[s])\n\t" + "sd " S1_3 ", 64(%[s])\n\t" + "sd " S1_4 ", 72(%[s])\n\t" + "sd " S2_0 ", 80(%[s])\n\t" + "sd " S2_1 ", 88(%[s])\n\t" + "sd " S2_2 ", 96(%[s])\n\t" + "sd " S2_3 ", 104(%[s])\n\t" + "sd " S2_4 ", 112(%[s])\n\t" + "sd " S3_0 ", 120(%[s])\n\t" + "sd " S3_1 ", 128(%[s])\n\t" + "sd " S3_2 ", 136(%[s])\n\t" + "sd " S3_3 ", 144(%[s])\n\t" + "sd " S3_4 ", 152(%[s])\n\t" + + "addi sp, sp, 24\n\t" + + : [r] "+r" (r) + : [s] "r" (s) + : "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11" + ); +} + +#else + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define COL_MIX(r, b1, b4) \ + VSLL_VI(REG_V31, b1, 1) \ + VSRL_VX(REG_V30, b1, REG_T1) \ + VXOR_VV(REG_V31, REG_V31, b4) \ + VXOR_VV(REG_V31, REG_V31, REG_V30) \ + VXOR_VV((r + 0), (r + 0), REG_V31) \ + VXOR_VV((r + 5), (r + 5), REG_V31) \ + VXOR_VV((r + 10), (r + 10), REG_V31) \ + VXOR_VV((r + 15), (r + 15), REG_V31) \ + VXOR_VV((r + 20), (r + 20), REG_V31) + +#define SWAP_ROTL_LO(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + "li t1, 64 - " #sl "\n\t" \ + VSLL_VI(vr, vt1, sl) \ + VSRL_VX(vt1, vt1, 
REG_T1) \ + VOR_VV(vr, vr, vt1) + +#define SWAP_ROTL_HI(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + "li t1, " #sl "\n\t" \ + VSRL_VI(vr, vt1, (64 - sl)) \ + VSLL_VX(vt1, vt1, REG_T1) \ + VOR_VV(vr, vr, vt1) + +#define ROW_MIX(r) \ + VMV_V_V(REG_V25, (r + 0)) \ + VMV_V_V(REG_V26, (r + 1)) \ + VNOT_V(REG_V30, (r + 1)) \ + VNOT_V(REG_V31, (r + 2)) \ + VAND_VV(REG_V30, REG_V30, (r + 2)) \ + VAND_VV(REG_V31, REG_V31, (r + 3)) \ + VXOR_VV((r + 0), REG_V30, (r + 0)) \ + VXOR_VV((r + 1), REG_V31, (r + 1)) \ + VNOT_V(REG_V30, (r + 3)) \ + VNOT_V(REG_V31, (r + 4)) \ + VAND_VV(REG_V30, REG_V30, (r + 4)) \ + VAND_VV(REG_V31, REG_V31, REG_V25) \ + VNOT_V(REG_V25, REG_V25) \ + VXOR_VV((r + 2), REG_V30, (r + 2)) \ + VAND_VV(REG_V25, REG_V25, REG_V26) \ + VXOR_VV((r + 3), REG_V31, (r + 3)) \ + VXOR_VV((r + 4), REG_V25, (r + 4)) + +#else + +#define COL_MIX(r, t) \ + VXOR_VV((r + 0), (r + 0), t) \ + VXOR_VV((r + 5), (r + 5), t) \ + VXOR_VV((r + 10), (r + 10), t) \ + VXOR_VV((r + 15), (r + 15), t) \ + VXOR_VV((r + 20), (r + 20), t) + +#define SWAP_ROTL(vr, vt0, vt1, sl) \ + VMV_V_V(vt0, vr) \ + VROR_VI(vr, (64 - sl), vt1) + +#define SWAP_ROTL_LO SWAP_ROTL +#define SWAP_ROTL_HI SWAP_ROTL + +#define ROW_MIX(r) \ + VMV_V_V(REG_V25, (r + 0)) \ + VMV_V_V(REG_V26, (r + 1)) \ + VANDN_VV(REG_V30, (r + 1), (r + 2)) \ + VANDN_VV(REG_V31, (r + 2), (r + 3)) \ + VXOR_VV((r + 0), REG_V30, (r + 0)) \ + VXOR_VV((r + 1), REG_V31, (r + 1)) \ + VANDN_VV(REG_V30, (r + 3), (r + 4)) \ + VANDN_VV(REG_V31, (r + 4), REG_V25) \ + VANDN_VV(REG_V25, REG_V25, REG_V26) \ + VXOR_VV((r + 2), REG_V30, (r + 2)) \ + VXOR_VV((r + 3), REG_V31, (r + 3)) \ + VXOR_VV((r + 4), REG_V25, (r + 4)) + +#endif + + +void BlockSha3(word64* s) +{ + __asm__ __volatile__ ( + /* 1 x 64-bit */ + VSETIVLI(REG_X0, 1, 0, 1, 0b011, 0b000) + + "li t2, 24\n\t" + "mv t0, %[r]\n\t" + "mv t1, %[s]\n\t" + VLSEG8E64_V(REG_V0, REG_T1) + "addi t1, %[s], 64\n\t" + VLSEG8E64_V(REG_V8, REG_T1) + "addi t1, %[s], 128\n\t" + VLSEG8E64_V(REG_V16, REG_T1) + "addi t1, %[s], 192\n\t" + VLSEG1E64_V(REG_V24, REG_T1) + + "\n" + "L_riscv_64_block_sha3_loop:\n\t" + + /* COLUMN MIX */ + VXOR_VV(REG_V25, REG_V0, REG_V5) + VXOR_VV(REG_V26, REG_V1, REG_V6) + VXOR_VV(REG_V27, REG_V2, REG_V7) + VXOR_VV(REG_V28, REG_V3, REG_V8) + VXOR_VV(REG_V29, REG_V4, REG_V9) + VXOR_VV(REG_V25, REG_V25, REG_V10) + VXOR_VV(REG_V26, REG_V26, REG_V11) + VXOR_VV(REG_V27, REG_V27, REG_V12) + VXOR_VV(REG_V28, REG_V28, REG_V13) + VXOR_VV(REG_V29, REG_V29, REG_V14) + VXOR_VV(REG_V25, REG_V25, REG_V15) + VXOR_VV(REG_V26, REG_V26, REG_V16) + VXOR_VV(REG_V27, REG_V27, REG_V17) + VXOR_VV(REG_V28, REG_V28, REG_V18) + VXOR_VV(REG_V29, REG_V29, REG_V19) + VXOR_VV(REG_V25, REG_V25, REG_V20) + VXOR_VV(REG_V26, REG_V26, REG_V21) + VXOR_VV(REG_V27, REG_V27, REG_V22) + VXOR_VV(REG_V28, REG_V28, REG_V23) + VXOR_VV(REG_V29, REG_V29, REG_V24) + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + "li t1, 63\n\t" + COL_MIX(REG_V0, REG_V26, REG_V29) + COL_MIX(REG_V1, REG_V27, REG_V25) + COL_MIX(REG_V2, REG_V28, REG_V26) + COL_MIX(REG_V3, REG_V29, REG_V27) + COL_MIX(REG_V4, REG_V25, REG_V28) +#else + VROR_VI(REG_V30, 63, REG_V26) + VROR_VI(REG_V31, 63, REG_V27) + VXOR_VV(REG_V30, REG_V30, REG_V29) + VXOR_VV(REG_V31, REG_V31, REG_V25) + COL_MIX(REG_V0, REG_V30) + COL_MIX(REG_V1, REG_V31) + + VROR_VI(REG_V30, 63, REG_V28) + VROR_VI(REG_V31, 63, REG_V29) + VROR_VI(REG_V25, 63, REG_V25) + VXOR_VV(REG_V30, REG_V30, REG_V26) + VXOR_VV(REG_V31, REG_V31, REG_V27) + VXOR_VV(REG_V25, REG_V25, REG_V28) + COL_MIX(REG_V2, REG_V30) + 
COL_MIX(REG_V3, REG_V31) + COL_MIX(REG_V4, REG_V25) +#endif + /* SWAP ROTL */ + /* t1 = s[ 1] */ + VMV_V_V(REG_V26, REG_V1) + /* t0 = s[10], s[10] = t1 <<< 1 */ + SWAP_ROTL_LO(REG_V10, REG_V25, REG_V26, 1) + /* t1 = s[ 7], s[ 7] = t0 <<< 3 */ + SWAP_ROTL_LO(REG_V7 , REG_V26, REG_V25, 3) + /* t0 = s[11], s[11] = t1 <<< 6 */ + SWAP_ROTL_LO(REG_V11, REG_V25, REG_V26, 6) + /* t1 = s[17], s[17] = t0 <<< 10 */ + SWAP_ROTL_LO(REG_V17, REG_V26, REG_V25, 10) + /* t0 = s[18], s[18] = t1 <<< 15 */ + SWAP_ROTL_LO(REG_V18, REG_V25, REG_V26, 15) + /* t1 = s[ 3], s[ 3] = t0 <<< 21 */ + SWAP_ROTL_LO(REG_V3 , REG_V26, REG_V25, 21) + /* t0 = s[ 5], s[ 5] = t1 <<< 28 */ + SWAP_ROTL_LO(REG_V5 , REG_V25, REG_V26, 28) + /* t1 = s[16], s[16] = t0 <<< 36 */ + SWAP_ROTL_HI(REG_V16, REG_V26, REG_V25, 36) + /* t0 = s[ 8], s[ 8] = t1 <<< 45 */ + SWAP_ROTL_HI(REG_V8 , REG_V25, REG_V26, 45) + /* t1 = s[21], s[21] = t0 <<< 55 */ + SWAP_ROTL_HI(REG_V21, REG_V26, REG_V25, 55) + /* t0 = s[24], s[24] = t1 <<< 2 */ + SWAP_ROTL_LO(REG_V24, REG_V25, REG_V26, 2) + /* t1 = s[ 4], s[ 4] = t0 <<< 14 */ + SWAP_ROTL_LO(REG_V4 , REG_V26, REG_V25, 14) + /* t0 = s[15], s[15] = t1 <<< 27 */ + SWAP_ROTL_LO(REG_V15, REG_V25, REG_V26, 27) + /* t1 = s[23], s[23] = t0 <<< 41 */ + SWAP_ROTL_HI(REG_V23, REG_V26, REG_V25, 41) + /* t0 = s[19], s[19] = t1 <<< 56 */ + SWAP_ROTL_HI(REG_V19, REG_V25, REG_V26, 56) + /* t1 = s[13], s[13] = t0 <<< 8 */ + SWAP_ROTL_LO(REG_V13, REG_V26, REG_V25, 8) + /* t0 = s[12], s[12] = t1 <<< 25 */ + SWAP_ROTL_LO(REG_V12, REG_V25, REG_V26, 25) + /* t1 = s[ 2], s[ 2] = t0 <<< 43 */ + SWAP_ROTL_HI(REG_V2 , REG_V26, REG_V25, 43) + /* t0 = s[20], s[20] = t1 <<< 62 */ + SWAP_ROTL_HI(REG_V20, REG_V25, REG_V26, 62) + /* t1 = s[14], s[14] = t0 <<< 18 */ + SWAP_ROTL_LO(REG_V14, REG_V26, REG_V25, 18) + /* t0 = s[22], s[22] = t1 <<< 39 */ + SWAP_ROTL_HI(REG_V22, REG_V25, REG_V26, 39) + /* t1 = s[ 9], s[ 9] = t0 <<< 61 */ + SWAP_ROTL_HI(REG_V9 , REG_V26, REG_V25, 61) + /* t0 = s[ 6], s[ 6] = t1 <<< 20 */ + SWAP_ROTL_LO(REG_V6 , REG_V25, REG_V26, 20) + /* s[ 1] = t0 <<< 44 */ + "li t1, 44\n\t" + VSRL_VI(REG_V1, REG_V25, (64 - 44)) + VSLL_VX(REG_V25, REG_V25, REG_T1) + VOR_VV(REG_V1, REG_V1, REG_V25) + + /* ROW MIX */ + ROW_MIX(REG_V0) + ROW_MIX(REG_V5) + ROW_MIX(REG_V10) + ROW_MIX(REG_V15) + ROW_MIX(REG_V20) + + VL1RE64_V(REG_V25, REG_T0) + "addi t0, t0, 8\n\t" + "addi t2, t2, -1\n\t" + VXOR_VV(REG_V0, REG_V0, REG_V25) + "bnez t2, L_riscv_64_block_sha3_loop\n\t" + + "mv t1, %[s]\n\t" + VSSEG8E64_V(REG_V0, REG_T1) + "addi t1, %[s], 64\n\t" + VSSEG8E64_V(REG_V8, REG_T1) + "addi t1, %[s], 128\n\t" + VSSEG8E64_V(REG_V16, REG_T1) + "addi t1, %[s], 192\n\t" + VSSEG1E64_V(REG_V24, REG_T1) + + : + : [s] "r" (s), [r] "r" (hash_keccak_r) + : "memory", "t0", "t1", "t2" + ); +} + +#endif /* WOLFSSL_RISCV_VECTOR */ +#endif /* WOLFSSL_SHA3 && !XILINX */ +#endif /* WOLFSSL_RISCV_ASM */ diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha512.c b/wolfcrypt/src/port/riscv/riscv-64-sha512.c new file mode 100644 index 000000000..217761e9e --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-sha512.c @@ -0,0 +1,1724 @@ +/* riscv-sha512.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#ifdef WOLFSSL_RISCV_ASM +#if !defined(NO_SHA512) || defined(WOLFSSL_SHA384) + +#if FIPS_VERSION3_LT(6,0,0) && defined(HAVE_FIPS) + #undef HAVE_FIPS +#else + #if defined(HAVE_FIPS) && FIPS_VERSION3_GE(6,0,0) + /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ + #define FIPS_NO_WRAPPERS + #endif +#endif + +#include +#if FIPS_VERSION3_GE(6,0,0) + const unsigned int wolfCrypt_FIPS_sha512_ro_sanity[2] = + { 0x1a2b3c4d, 0x00000014 }; + int wolfCrypt_FIPS_SHA512_sanity(void) + { + return 0; + } +#endif +#include +#include + +#include + +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +/* Constants to add in each round. */ +static const word64 K512[80] = { + W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), + W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), + W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), + W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), + W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), + W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), + W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), + W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), + W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), + W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), + W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), + W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), + W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), + W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), + W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), + W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), + W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), + W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), + W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), + W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), + W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), + W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), + W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), + W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), + W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), + W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), + W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), + W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), + W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), + W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), + W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), + W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), + W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), + W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), + W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), + W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), + W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), + W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), + W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), + W64LIT(0x5fcb6fab3ad6faec), 
W64LIT(0x6c44198c4a475817) +}; + +static int InitSha512(wc_Sha512* sha512, void* heap, int devId) +{ + int ret = 0; + + if (sha512 == NULL) { + ret = BAD_FUNC_ARG; + } + + if (ret == 0) { + sha512->heap = heap; + #ifdef WOLF_CRYPTO_CB + sha512->devId = devId; + #endif + (void)devId; + #ifdef WOLFSSL_SMALL_STACK_CACHE + sha512->W = NULL; + #endif + + #ifdef WOLFSSL_HASH_FLAGS + sha512->flags = 0; + #endif + } + + return ret; +} + +/* Initialize SHA-512 object for hashing. + * + * @param [in, out] sha512 SHA-512 object. + */ +static void InitSha512_State(wc_Sha512* sha512) +{ + /* Set initial hash values. */ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha512->digest[0] = W64LIT(0x6a09e667f3bcc908); + sha512->digest[1] = W64LIT(0xbb67ae8584caa73b); + sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b); + sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1); + sha512->digest[4] = W64LIT(0x510e527fade682d1); + sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f); + sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b); + sha512->digest[7] = W64LIT(0x5be0cd19137e2179); +#else + /* f, e, b, a, h, g, d, c */ + sha512->digest[0] = W64LIT(0x9b05688c2b3e6c1f); + sha512->digest[1] = W64LIT(0x510e527fade682d1); + sha512->digest[2] = W64LIT(0xbb67ae8584caa73b); + sha512->digest[3] = W64LIT(0x6a09e667f3bcc908); + sha512->digest[4] = W64LIT(0x5be0cd19137e2179); + sha512->digest[5] = W64LIT(0x1f83d9abfb41bd6b); + sha512->digest[6] = W64LIT(0xa54ff53a5f1d36f1); + sha512->digest[7] = W64LIT(0x3c6ef372fe94f82b); +#endif + + /* No hashed data. */ + sha512->buffLen = 0; + /* No data hashed. */ + sha512->loLen = 0; + sha512->hiLen = 0; +} + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_224) +/** + * Initialize given wc_Sha512 structure with value specific to sha512/224. + * Note that sha512/224 has different initial hash value from sha512. + * The initial hash value consists of eight 64bit words. They are given + * in FIPS180-4. + */ +static void InitSha512_224_State(wc_Sha512* sha512) +{ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha512->digest[0] = W64LIT(0x8c3d37c819544da2); + sha512->digest[1] = W64LIT(0x73e1996689dcd4d6); + sha512->digest[2] = W64LIT(0x1dfab7ae32ff9c82); + sha512->digest[3] = W64LIT(0x679dd514582f9fcf); + sha512->digest[4] = W64LIT(0x0f6d2b697bd44da8); + sha512->digest[5] = W64LIT(0x77e36f7304c48942); + sha512->digest[6] = W64LIT(0x3f9d85a86a1d36c8); + sha512->digest[7] = W64LIT(0x1112e6ad91d692a1); +#else + /* f, e, b, a, h, g, d, c */ + sha512->digest[0] = W64LIT(0x77e36f7304c48942); + sha512->digest[1] = W64LIT(0x0f6d2b697bd44da8); + sha512->digest[2] = W64LIT(0x73e1996689dcd4d6); + sha512->digest[3] = W64LIT(0x8c3d37c819544da2); + sha512->digest[4] = W64LIT(0x1112e6ad91d692a1); + sha512->digest[5] = W64LIT(0x3f9d85a86a1d36c8); + sha512->digest[6] = W64LIT(0x679dd514582f9fcf); + sha512->digest[7] = W64LIT(0x1dfab7ae32ff9c82); +#endif + + /* No hashed data. */ + sha512->buffLen = 0; + /* No data hashed. */ + sha512->loLen = 0; + sha512->hiLen = 0; +} +#endif /* !WOLFSSL_NOSHA512_224 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_256) +/** + * Initialize given wc_Sha512 structure with value specific to sha512/256. + * Note that sha512/256 has different initial hash value from sha512. + * The initial hash value consists of eight 64bit words. They are given + * in FIPS180-4. 
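+ * As with the other Init*_State functions, when WOLFSSL_RISCV_VECTOR_CRYPTO_ASM
+ * is defined the words are stored in f, e, b, a, h, g, d, c order so that they
+ * match the a|b|e|f / c|d|g|h register layout loaded by the vector transform.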
+ */ +static void InitSha512_256_State(wc_Sha512* sha512) +{ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha512->digest[0] = W64LIT(0x22312194fc2bf72c); + sha512->digest[1] = W64LIT(0x9f555fa3c84c64c2); + sha512->digest[2] = W64LIT(0x2393b86b6f53b151); + sha512->digest[3] = W64LIT(0x963877195940eabd); + sha512->digest[4] = W64LIT(0x96283ee2a88effe3); + sha512->digest[5] = W64LIT(0xbe5e1e2553863992); + sha512->digest[6] = W64LIT(0x2b0199fc2c85b8aa); + sha512->digest[7] = W64LIT(0x0eb72ddc81c52ca2); +#else + /* f, e, b, a, h, g, d, c */ + sha512->digest[0] = W64LIT(0xbe5e1e2553863992); + sha512->digest[1] = W64LIT(0x96283ee2a88effe3); + sha512->digest[2] = W64LIT(0x9f555fa3c84c64c2); + sha512->digest[3] = W64LIT(0x22312194fc2bf72c); + sha512->digest[4] = W64LIT(0x0eb72ddc81c52ca2); + sha512->digest[5] = W64LIT(0x2b0199fc2c85b8aa); + sha512->digest[6] = W64LIT(0x963877195940eabd); + sha512->digest[7] = W64LIT(0x2393b86b6f53b151); +#endif + + /* No hashed data. */ + sha512->buffLen = 0; + /* No data hashed. */ + sha512->loLen = 0; + sha512->hiLen = 0; +} +#endif /* !WOLFSSL_NOSHA512_256 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + +/* More data hashed, add length to 64-bit cumulative total. + * + * @param [in, out] sha512 SHA-512 object. Assumed not NULL. + * @param [in] len Length to add. + */ +static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) +{ + word32 tmp = sha512->loLen; + if ((sha512->loLen += len) < tmp) + sha512->hiLen++; /* carry low to high */ +} + +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + +/* Load a word with bytes reversed. */ +#define LOAD_DWORD_REV(r, o, p, t0, t1, t2, t3) \ + "lbu " #t0 ", " #o "+4(" #p ")\n\t" \ + "lbu " #t1 ", " #o "+5(" #p ")\n\t" \ + "lbu " #t2 ", " #o "+6(" #p ")\n\t" \ + "lbu " #r ", " #o "+7(" #p ")\n\t" \ + "slli " #t0 ", " #t0 ", 24\n\t" \ + "slli " #t1 ", " #t1 ", 16\n\t" \ + "slli " #t2 ", " #t2 ", 8\n\t" \ + "or " #r ", " #r ", " #t0 "\n\t" \ + "or " #r ", " #r ", " #t1 "\n\t" \ + "or " #r ", " #r ", " #t2 "\n\t" \ + "lbu " #t0 ", " #o "+0(" #p ")\n\t" \ + "lbu " #t1 ", " #o "+1(" #p ")\n\t" \ + "lbu " #t2 ", " #o "+2(" #p ")\n\t" \ + "lbu " #t3 ", " #o "+3(" #p ")\n\t" \ + "slli " #t0 ", " #t0 ", 56\n\t" \ + "slli " #t1 ", " #t1 ", 48\n\t" \ + "slli " #t2 ", " #t2 ", 40\n\t" \ + "slli " #t3 ", " #t3 ", 32\n\t" \ + "or " #r ", " #r ", " #t0 "\n\t" \ + "or " #r ", " #r ", " #t1 "\n\t" \ + "or " #r ", " #r ", " #t2 "\n\t" \ + "or " #r ", " #r ", " #t3 "\n\t" + +#endif + +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + +#ifdef WOLFSSL_RISCV_SCALAR_CRYPTO_ASM + +/* SHA-512 SUM0 operation. */ +#define SHA512SUM0(rd, rs1) \ + ASM_WORD((0b000100000100 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-512 SUM1 operation. */ +#define SHA512SUM1(rd, rs1) \ + ASM_WORD((0b000100000101 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-512 SIGMA0 operation. */ +#define SHA512SIG0(rd, rs1) \ + ASM_WORD((0b000100000110 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) +/* SHA-512 SIGMA1 operation. */ +#define SHA512SIG1(rd, rs1) \ + ASM_WORD((0b000100000111 << 20) | (0b001 << 12) | 0b0010011 | \ + (rs1 << 15) | (rd << 7)) + +/* One round of compression. 
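+ * In terms of FIPS 180-4 this computes, folded so that no separate T1/T2
+ * temporaries are kept:
+ *   T1 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i]
+ *   T2 = Sigma0(a) + Maj(a, b, c)
+ *   d += T1;  h = T1 + T2
+ * Ch and Maj use the xor/and forms shown in the inline comments below.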
*/ +#define RND(a, b, c, d, e, f, g, h, w, k) \ + /* Get e and a */ \ + "mv a4, " #e "\n\t" \ + "mv a5, " #a "\n\t" \ + /* Sigma1(e) */ \ + SHA512SUM1(REG_A4, REG_A4) \ + /* Sigma0(a) */ \ + SHA512SUM0(REG_A5, REG_A5) \ + /* Maj(a, b, c) = t5 */ \ + /* Ch(e, f, g) = t6 */ \ + /* a ^ b */ \ + "xor t4, " #a ", " #b "\n\t" \ + /* f ^ g */ \ + "xor t6, " #f ", " #g "\n\t" \ + /* b ^ c */ \ + "xor t5, " #b ", " #c "\n\t" \ + /* (f ^ g) & e */ \ + "and t6, t6, " #e "\n\t" \ + /* (a^b) & (b^c) */ \ + "and t5, t5, t4\n\t" \ + /* ((f ^ g) & e) ^ g */ \ + "xor t6, t6, " #g "\n\t" \ + /* ((a^b) & (b^c)) ^ b */ \ + "xor t5, t5, " #b "\n\t" \ + /* sigma1 + Ch */ \ + "add t4, a4, t6\n\t" \ + /* K + W */ \ + "add t6, " #k ", " #w "\n\t" \ + /* sigma1 + Ch + K + W = 't0'-h */ \ + "add t4, t4, t6\n\t" \ + /* h + sigma1 + Ch + K + W = 't0' = h */ \ + "add " #h ", " #h ", t4\n\t" \ + /* Sigma0(a) + Maj = 't1' */ \ + "add t5, a5, t5\n\t" \ + /* d += 't0' */ \ + "add " #d ", " #d ", " #h "\n\t" \ + /* h += 't1' */ \ + "add " #h ", " #h ", t5\n\t" + +#define W_UPDATE(w0, w1, w9, w14, reg_w0, reg_w1, reg_w9, reg_w14) \ + /* Gamma0(W[1]) */ \ + SHA512SIG0(REG_A4, reg_w1) \ + /* Gamma1(W[i-2]) = Gamma1(W[14]) */ \ + SHA512SIG1(REG_A5, reg_w14) \ + /* Gamma1(W[14]) + W[9] */ \ + "add a5, a5, " #w9 "\n\t" \ + /* Gamma0(W[1]) + W[i-16] = Gamma0(W[1]) + W[0] */ \ + "add " #w0 ", " #w0 ", a4\n\t" \ + /* W[0] = Gamma1(W[14]) + W[9] + Gamma0(W[1]) + W[0] */ \ + "add " #w0 ", a5, " #w0 "\n\t" + +#else + +/* SHA-512 SUM0 operation. */ +#define SHA512SUM0(rd, rs1) \ + "slli t5, " #rs1 ", 36\n\t" \ + "srli t4, " #rs1 ", 28\n\t" \ + "slli t6, " #rs1 ", 30\n\t" \ + "or t4, t4, t5\n\t" \ + "srli t5, " #rs1 ", 34\n\t" \ + "xor t4, t4, t6\n\t" \ + "slli t6, " #rs1 ", 25\n\t" \ + "xor t4, t4, t5\n\t" \ + "srli " #rd ", " #rs1 ", 39\n\t" \ + "xor t4, t4, t6\n\t" \ + "xor " #rd ", " #rd ", t4\n\t" + +/* SHA-512 SUM1 operation. */ +#define SHA512SUM1(rd, rs1) \ + "slli t5, " #rs1 ", 50\n\t" \ + "srli t4, " #rs1 ", 14\n\t" \ + "slli t6, " #rs1 ", 46\n\t" \ + "or t4, t4, t5\n\t" \ + "srli t5, " #rs1 ", 18\n\t" \ + "xor t4, t4, t6\n\t" \ + "slli t6, " #rs1 ", 23\n\t" \ + "xor t4, t4, t5\n\t" \ + "srli " #rd ", " #rs1 ", 41\n\t" \ + "xor t4, t4, t6\n\t" \ + "xor " #rd ", " #rd ", t4\n\t" + +/* SHA-512 SIGMA0 operation. */ +#define SHA512SIG0(rd, rs1) \ + "slli t5, " #rs1 ", 63\n\t" \ + "srli t6, " #rs1 ", 1\n\t" \ + "slli t4, " #rs1 ", 56\n\t" \ + "or t6, t6, t5\n\t" \ + "srli t5, " #rs1 ", 8\n\t" \ + "xor t6, t6, t4\n\t" \ + "srli " #rd ", " #rs1 ", 7\n\t" \ + "xor t6, t6, t5\n\t" \ + "xor " #rd ", " #rd ", t6\n\t" + +/* SHA-512 SIGMA1 operation. */ +#define SHA512SIG1(rd, rs1) \ + "slli t5, " #rs1 ", 45\n\t" \ + "srli t6, " #rs1 ", 19\n\t" \ + "slli t4, " #rs1 ", 3\n\t" \ + "or t6, t6, t5\n\t" \ + "srli t5, " #rs1 ", 61\n\t" \ + "xor t6, t6, t4\n\t" \ + "srli " #rd ", " #rs1 ", 6\n\t" \ + "xor t6, t6, t5\n\t" \ + "xor " #rd ", " #rd ", t6\n\t" + +/* One round of compression. 
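+ * Same round as the scalar-crypto variant above; the SHA512SUM0/SUM1 and
+ * SHA512SIG0/SIG1 helpers emulate the Zknh instructions with shift pairs,
+ * since base RV64I has no rotate instruction:
+ *   Sigma0(x) = ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39)
+ *   Sigma1(x) = ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41)
+ *   sigma0(x) = ROTR(x, 1)  ^ ROTR(x, 8)  ^ (x >> 7)
+ *   sigma1(x) = ROTR(x, 19) ^ ROTR(x, 61) ^ (x >> 6)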
*/ +#define RND(a, b, c, d, e, f, g, h, w, k) \ + /* Sigma1(e) */ \ + SHA512SUM1(a4, e) \ + /* Sigma0(a) */ \ + SHA512SUM0(a5, a) \ + /* Maj(a, b, c) = t5 */ \ + /* Ch(e, f, g) = t6 */ \ + /* a ^ b */ \ + "xor t4, " #a ", " #b "\n\t" \ + /* f ^ g */ \ + "xor t6, " #f ", " #g "\n\t" \ + /* b ^ c */ \ + "xor t5, " #b ", " #c "\n\t" \ + /* (f ^ g) & e */ \ + "and t6, t6, " #e "\n\t" \ + /* (a^b) & (b^c) */ \ + "and t5, t5, t4\n\t" \ + /* ((f ^ g) & e) ^ g */ \ + "xor t6, t6, " #g "\n\t" \ + /* ((a^b) & (b^c)) ^ b */ \ + "xor t5, t5, " #b "\n\t" \ + /* sigma1 + Ch */ \ + "add t4, a4, t6\n\t" \ + /* K + W */ \ + "add t6, " #k ", " #w "\n\t" \ + /* sigma1 + Ch + K + W = 't0'-h */ \ + "add t4, t4, t6\n\t" \ + /* h + sigma1 + Ch + K + W = 't0' = h */ \ + "add " #h ", " #h ", t4\n\t" \ + /* Sigma0(a) + Maj = 't1' */ \ + "add t5, a5, t5\n\t" \ + /* d += 't0' */ \ + "add " #d ", " #d ", " #h "\n\t" \ + /* h += 't1' */ \ + "add " #h ", " #h ", t5\n\t" + +/* Two message schedule updates. */ +#define W_UPDATE(w0, w1, w9, w14, reg_w0, reg_w1, reg_w9, reg_14) \ + /* Gamma0(W[1]) */ \ + SHA512SIG0(a4, w1) \ + /* Gamma1(W[i-2]) = Gamma1(W[14]) */ \ + SHA512SIG1(a5, w14) \ + /* Gamma1(W[14]) + W[9] */ \ + "add a5, a5, " #w9 "\n\t" \ + /* Gamma0(W[1]) + W[i-16] = Gamma0(W[1]) + W[0] */ \ + "add " #w0 ", " #w0 ", a4\n\t" \ + /* W[0] = Gamma1(W[14]) + W[9] + Gamma0(W[1]) + W[0] */ \ + "add " #w0 ", a5, " #w0 "\n\t" + + +#endif /* WOLFSSL_RISCV_SCALAR_CRYPTO_ASM */ + +#define RND2_W(a, b, c, d, e, f, g, h, o, w2o, w9o, w10o) \ + /* Get k[i] */ \ + "ld a6, " #o "(%[k])\n\t" \ + /* Get k[i+1] */ \ + "ld a7, " #o "+8(%[k])\n\t" \ + RND(a, b, c, d, e, f, g, h, s1, a6) \ + /* Get W[1] */ \ + "ld s2, " #o "+8(sp)\n\t" \ + /* Get W[9] */ \ + "ld s3, " #w9o "(sp)\n\t" \ + W_UPDATE(s1, s2, s3, s4, REG_S1, REG_S2, REG_S3, REG_S4) \ + RND(h, a, b, c, d, e, f, g, s2, a7) \ + "mv s4, s1\n\t" \ + /* Get W[2] */ \ + "ld s1, " #w2o "(sp)\n\t" \ + /* Get W[10] */ \ + "ld s3, " #w10o "(sp)\n\t" \ + W_UPDATE(s2, s1, s3, s5, REG_S2, REG_S1, REG_S3, REG_S5) \ + "sd s4, " #o "(sp)\n\t" \ + "mv s5, s2\n\t" \ + "sd s2, " #o "+8(sp)\n\t" + +/* Sixteen rounds of compression with message scheduling. */ +#define RND16() \ + RND2_W(t0, t1, t2, t3, s8, s9, s10, s11, 0, 16, 72, 80) \ + RND2_W(s10, s11, t0, t1, t2, t3, s8, s9, 16, 32, 88, 96) \ + RND2_W(s8, s9, s10, s11, t0, t1, t2, t3, 32, 48, 104, 112) \ + RND2_W(t2, t3, s8, s9, s10, s11, t0, t1, 48, 64, 120, 0) \ + RND2_W(t0, t1, t2, t3, s8, s9, s10, s11, 64, 80, 8, 16) \ + RND2_W(s10, s11, t0, t1, t2, t3, s8, s9, 80, 96, 24, 32) \ + RND2_W(s8, s9, s10, s11, t0, t1, t2, t3, 96, 112, 40, 48) \ + RND2_W(t2, t3, s8, s9, s10, s11, t0, t1, 112, 0, 56, 64) + +#define RND2(a, b, c, d, e, f, g, h, o) \ + /* Get k[i] */ \ + "ld a6, " #o "(%[k])\n\t" \ + /* Get W[0] */ \ + "ld s1, " #o "(sp)\n\t" \ + RND(a, b, c, d, e, f, g, h, s1, a6) \ + /* Get k[i] */ \ + "ld a6, " #o "+8(%[k])\n\t" \ + /* Get W[1] */ \ + "ld s1, " #o "+8(sp)\n\t" \ + RND(h, a, b, c, d, e, f, g, s1, a6) + +/* Sixteen rounds of compression only. */ +#define RND16_LAST() \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, 0) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, 16) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, 32) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, 48) \ + RND2(t0, t1, t2, t3, s8, s9, s10, s11, 64) \ + RND2(s10, s11, t0, t1, t2, t3, s8, s9, 80) \ + RND2(s8, s9, s10, s11, t0, t1, t2, t3, 96) \ + RND2(t2, t3, s8, s9, s10, s11, t0, t1, 112) + +/* Transform the message data. + * + * @param [in, out] sha512 SHA-512 object. 
+ * @param [in] data Buffer of data to hash. + * @param [in] blocks Number of blocks of data to hash. + */ +static WC_INLINE void Sha512Transform(wc_Sha512* sha512, const byte* data, + word32 blocks) +{ + word64* k = (word64*)K512; + + __asm__ __volatile__ ( + "addi sp, sp, -128\n\t" + + /* Load digest. */ + "ld t0, 0(%[digest])\n\t" + "ld t1, 8(%[digest])\n\t" + "ld t2, 16(%[digest])\n\t" + "ld t3, 24(%[digest])\n\t" + "ld s8, 32(%[digest])\n\t" + "ld s9, 40(%[digest])\n\t" + "ld s10, 48(%[digest])\n\t" + "ld s11, 56(%[digest])\n\t" + + /* 5 rounds of 16 per block - 4 loops of 16 and 1 final 16. */ + "slli %[blocks], %[blocks], 2\n\t" + + "\n1:\n\t" + /* beginning of SHA512 block operation */ + /* Load W */ +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + LOAD_DWORD_REV(t4, 0, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s1, 8, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s2, 16, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s3, 24, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s4, 32, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s5, 40, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s6, 48, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s7, 56, %[data], a4, a5, a6, a7) +#else + "ld t4, 0(%[data])\n\t" + "ld s1, 8(%[data])\n\t" + "ld s2, 16(%[data])\n\t" + "ld s3, 24(%[data])\n\t" + "ld s4, 32(%[data])\n\t" + "ld s5, 40(%[data])\n\t" + "ld s6, 48(%[data])\n\t" + "ld s7, 56(%[data])\n\t" + REV8(REG_T4, REG_T4) + REV8(REG_S1, REG_S1) + REV8(REG_S2, REG_S2) + REV8(REG_S3, REG_S3) + REV8(REG_S4, REG_S4) + REV8(REG_S5, REG_S5) + REV8(REG_S6, REG_S6) + REV8(REG_S7, REG_S7) +#endif + "sd t4, 0(sp)\n\t" + "sd s1, 8(sp)\n\t" + "sd s2, 16(sp)\n\t" + "sd s3, 24(sp)\n\t" + "sd s4, 32(sp)\n\t" + "sd s5, 40(sp)\n\t" + "sd s6, 48(sp)\n\t" + "sd s7, 56(sp)\n\t" +#ifndef WOLFSSL_RISCV_BASE_BIT_MANIPULATION + LOAD_DWORD_REV(t4, 64, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s1, 72, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s2, 80, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s3, 88, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s4, 96, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s5, 104, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s6, 112, %[data], a4, a5, a6, a7) + LOAD_DWORD_REV(s7, 120, %[data], a4, a5, a6, a7) +#else + "ld t4, 64(%[data])\n\t" + "ld s1, 72(%[data])\n\t" + "ld s2, 80(%[data])\n\t" + "ld s3, 88(%[data])\n\t" + "ld s4, 96(%[data])\n\t" + "ld s5, 104(%[data])\n\t" + "ld s6, 112(%[data])\n\t" + "ld s7, 120(%[data])\n\t" + REV8(REG_T4, REG_T4) + REV8(REG_S1, REG_S1) + REV8(REG_S2, REG_S2) + REV8(REG_S3, REG_S3) + REV8(REG_S4, REG_S4) + REV8(REG_S5, REG_S5) + REV8(REG_S6, REG_S6) + REV8(REG_S7, REG_S7) +#endif + "sd t4, 64(sp)\n\t" + "sd s1, 72(sp)\n\t" + "sd s2, 80(sp)\n\t" + "sd s3, 88(sp)\n\t" + "sd s4, 96(sp)\n\t" + "sd s5, 104(sp)\n\t" + "sd s6, 112(sp)\n\t" + "sd s7, 120(sp)\n\t" + + "\n2:\n\t" + /* Get W[0] */ + "ld s1, 0(sp)\n\t" + /* Get W[14] */ + "ld s4, 112(sp)\n\t" + /* Get W[15] */ + "ld s5, 120(sp)\n\t" + "addi %[blocks], %[blocks], -1\n\t" + RND16() + "andi a4, %[blocks], 3\n\t" + "add %[k], %[k], 128\n\t" + "bnez a4, 2b \n\t" + RND16_LAST() + "addi %[k], %[k], -512\n\t" + + "# Add working vars back into digest state.\n\t" + "ld t4, 0(%[digest])\n\t" + "ld s1, 8(%[digest])\n\t" + "ld s2, 16(%[digest])\n\t" + "ld s3, 24(%[digest])\n\t" + "ld s4, 32(%[digest])\n\t" + "ld s5, 40(%[digest])\n\t" + "ld s6, 48(%[digest])\n\t" + "ld s7, 56(%[digest])\n\t" + "add t0, t0, t4\n\t" + "add t1, t1, s1\n\t" + "add t2, t2, s2\n\t" + "add t3, t3, s3\n\t" + "add s8, s8, s4\n\t" + "add s9, s9, s5\n\t" + "add s10, s10, s6\n\t" + "add 
s11, s11, s7\n\t" + + /* Store digest. */ + "sd t0, 0(%[digest])\n\t" + "sd t1, 8(%[digest])\n\t" + "sd t2, 16(%[digest])\n\t" + "sd t3, 24(%[digest])\n\t" + "sd s8, 32(%[digest])\n\t" + "sd s9, 40(%[digest])\n\t" + "sd s10, 48(%[digest])\n\t" + "sd s11, 56(%[digest])\n\t" + + "add %[data], %[data], 128\n\t" + "bnez %[blocks], 1b \n\t" + + "addi sp, sp, 128\n\t" + + : [blocks] "+r" (blocks), [data] "+r" (data), [k] "+r" (k) + : [digest] "r" (sha512->digest) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7", + "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", + "s11" + ); +} + +#else + +/* Two rounds of compression using low two W values. + * Assumes K has been added into W values. + */ +#define VSHA2CL_VV(vd, vs1, vs2) \ + ASM_WORD((0b101111 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +/* Two rounds of compression using upper two W values. + * Assumes K has been added into W values. + */ +#define VSHA2CH_VV(vd, vs1, vs2) \ + ASM_WORD((0b101110 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +/* Update 4 W values - message scheduling. */ +#define VSHA2MS_VV(vd, vs1, vs2) \ + ASM_WORD((0b101101 << 26) | (0b1 << 25) | \ + (0b010 << 12) | (0b1110111 << 0) | \ + (vd << 7) | (vs1 << 15) | (vs2 << 20)) + +#define RND4(w0, w2, w4, w6, k) \ + /* Four rounds of compression. */ \ + VADD_VV(REG_V14, w0, k) \ + VMV_X_S(REG_T1, w2) \ + VSHA2CL_VV(REG_V10, REG_V14, REG_V8) \ + VMV_V_V(REG_V12, w4) \ + VSHA2CH_VV(REG_V8, REG_V14, REG_V10) \ + /* Update 4 W values - message schedule. */ \ + VMV_S_X(REG_V12, REG_T1) \ + VSHA2MS_VV(w0, w6, REG_V12) + +#define RND4_LAST(w, k) \ + /* Four rounds of compression. */ \ + VADD_VV(REG_V14, w, k) \ + VSHA2CL_VV(REG_V10, REG_V14, REG_V8) \ + VSHA2CH_VV(REG_V8, REG_V14, REG_V10) + +#define RND16(k) \ + RND4(REG_V0, REG_V2, REG_V4, REG_V6, (k + 0)) \ + RND4(REG_V2, REG_V4, REG_V6, REG_V0, (k + 2)) \ + RND4(REG_V4, REG_V6, REG_V0, REG_V2, (k + 4)) \ + RND4(REG_V6, REG_V0, REG_V2, REG_V4, (k + 6)) + +#define RND16_LAST(k) \ + RND4_LAST(REG_V0, (k + 0)) \ + RND4_LAST(REG_V2, (k + 2)) \ + RND4_LAST(REG_V4, (k + 4)) \ + RND4_LAST(REG_V6, (k + 6)) + +/* Transform the message data. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] data Buffer of data to hash. + * @param [in] blocks Number of blocks of data to hash. + */ +static void Sha512Transform(wc_Sha512* sha512, const byte* data, + word32 blocks) +{ + word64* k = (word64*)K512; + + __asm__ __volatile__ ( + VSETIVLI(REG_ZERO, 4, 1, 1, 0b011, 0b001) + + /* Load: a|b|e|f, c|d|g|h + * 3 2 1 0 3 2 1 0 + */ + "mv t0, %[digest]\n\t" + VL4RE64_V(REG_V8, REG_T0) + + "\n1:\n\t" + VMVR_V(REG_V28, REG_V8, 4) + + /* Load 16 W into 8 vectors of 2 64-bit words. 
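+         * V0..V7 each hold two 64-bit message words (W[0..15] for this block)
+         * and VREV8 converts them from the input's big-endian byte order.
+         * RND16 below then consumes and rewrites them four words (one LMUL=2
+         * register group) at a time via VSHA2MS/VSHA2CL/VSHA2CH.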
*/ + "mv t0, %[data]\n\t" + VL8RE64_V(REG_V0, REG_T0) + VREV8(REG_V0, REG_V0) + VREV8(REG_V2, REG_V2) + VREV8(REG_V4, REG_V4) + VREV8(REG_V6, REG_V6) + + "mv t0, %[k]\n\t" + VL8RE64_V(REG_V16, REG_T0) + RND16(REG_V16) + "addi t0, %[k], 128\n\t" + VL8RE64_V(REG_V16, REG_T0) + RND16(REG_V16) + "addi t0, %[k], 256\n\t" + VL8RE64_V(REG_V16, REG_T0) + RND16(REG_V16) + "addi t0, %[k], 384\n\t" + VL8RE64_V(REG_V16, REG_T0) + RND16(REG_V16) + "addi t0, %[k], 512\n\t" + VL8RE64_V(REG_V16, REG_T0) + RND16_LAST(REG_V16) + + VADD_VV(REG_V8, REG_V8, REG_V28) + VADD_VV(REG_V10, REG_V10, REG_V30) + + "addi %[blocks], %[blocks], -1\n\t" + "add %[data], %[data], 128\n\t" + "bnez %[blocks], 1b \n\t" + + "mv t0, %[digest]\n\t" + VS4R_V(REG_V8, REG_T0) + + : [blocks] "+r" (blocks), [data] "+r" (data), [k] "+r" (k) + : [digest] "r" (sha512->digest) + : "cc", "memory", "t0", "t1" + ); +} + +#endif /* WOLFSSL_RISCV_VECTOR_CRYPTO_ASM */ + +/* Update the hash with data. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. + */ +static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, + word32 len) +{ + word32 add; + word32 blocks; + + /* only perform actions if a buffer is passed in */ + if (len > 0) { + AddLength(sha512, len); + + if (sha512->buffLen > 0) { + /* fill leftover buffer with data */ + add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); + XMEMCPY((byte*)(sha512->buffer) + sha512->buffLen, data, add); + sha512->buffLen += add; + data += add; + len -= add; + if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { + Sha512Transform(sha512, (byte*)sha512->buffer, 1); + sha512->buffLen = 0; + } + } + + /* number of blocks in a row to complete */ + blocks = len / WC_SHA512_BLOCK_SIZE; + + if (blocks > 0) { + Sha512Transform(sha512, data, blocks); + data += blocks * WC_SHA512_BLOCK_SIZE; + len -= blocks * WC_SHA512_BLOCK_SIZE; + } + + if (len > 0) { + /* copy over any remaining data leftover */ + XMEMCPY(sha512->buffer, data, len); + sha512->buffLen = len; + } + } + + /* account for possibility of not used if len = 0 */ + (void)add; + (void)blocks; + + return 0; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha512 SHA-512 object. + * @param [out] hash Buffer to hold hash result. + * @param [in] hashLen Length of hash to write out. + */ +static WC_INLINE void Sha512Final(wc_Sha512* sha512, byte* hash, int hashLen) +{ + byte* local; + byte hashBuf[WC_SHA512_DIGEST_SIZE]; + byte* hashRes = hash; + + if (hashLen < WC_SHA512_DIGEST_SIZE) { + hashRes = hashBuf; + } + + local = (byte*)sha512->buffer; + local[sha512->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha512->buffLen > WC_SHA512_PAD_SIZE) { + XMEMSET(&local[sha512->buffLen], 0, + WC_SHA512_BLOCK_SIZE - sha512->buffLen); + Sha512Transform(sha512, (byte*)sha512->buffer, 1); + sha512->buffLen = 0; + } + XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen); + + /* put lengths in bits */ + sha512->hiLen = (sha512->loLen >> (8*sizeof(sha512->loLen) - 3)) + + (sha512->hiLen << 3); + sha512->loLen = sha512->loLen << 3; + + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; + sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; + + /* store lengths */ + __asm__ __volatile__ ( + /* Reverse byte order of 64-bit words. 
*/ +#if defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION) + "ld t0, 112(%[buff])\n\t" + "ld t1, 120(%[buff])\n\t" + REV8(REG_T0, REG_T0) + REV8(REG_T1, REG_T1) +#else + LOAD_DWORD_REV(t0, 112, %[buff], t2, t3, t4, t5) + LOAD_DWORD_REV(t1, 120, %[buff], t2, t3, t4, t5) +#endif + "sd t0, 112(%[buff])\n\t" + "sd t1, 120(%[buff])\n\t" + : + : [buff] "r" (sha512->buffer) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5" + ); + + Sha512Transform(sha512, (byte*)sha512->buffer, 1); + + __asm__ __volatile__ ( + /* Reverse byte order of 64-bit words. */ +#if defined(WOLFSSL_RISCV_VECTOR_CRYPTO_ASM) + VSETIVLI(REG_ZERO, 4, 1, 1, 0b011, 0b001) + "mv t0, %[digest]\n\t" + VL4RE64_V(REG_V4, REG_T0) + VREV8(REG_V4, REG_V4) + VREV8(REG_V6, REG_V6) + VSETIVLI(REG_ZERO, 2, 1, 1, 0b011, 0b000) + /* e|f, a|b, g|h, c|d + * 1 0 1 0 1 0 1 0 */ + VSLIDEDOWN_VI(REG_V0, REG_V5, 1) /* a */ + VSLIDEDOWN_VI(REG_V1, REG_V7, 1) /* c */ + VSLIDEDOWN_VI(REG_V2, REG_V4, 1) /* e */ + VSLIDEDOWN_VI(REG_V3, REG_V6, 1) /* g */ + VSLIDEUP_VI(REG_V0, REG_V5, 1) + VSLIDEUP_VI(REG_V1, REG_V7, 1) + VSLIDEUP_VI(REG_V2, REG_V4, 1) + VSLIDEUP_VI(REG_V3, REG_V6, 1) + "mv t0, %[hash]\n\t" + VS4R_V(REG_V0, REG_T0) +#elif defined(WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION) + VSETIVLI(REG_ZERO, 4, 1, 1, 0b011, 0b001) + "mv t0, %[digest]\n\t" + VL4RE64_V(REG_V0, REG_T0) + VREV8(REG_V0, REG_V0) + VREV8(REG_V2, REG_V2) + "mv t0, %[hash]\n\t" + VS4R_V(REG_V0, REG_T0) +#elif defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION) + "ld t0, 0(%[digest])\n\t" + "ld t1, 8(%[digest])\n\t" + "ld t2, 16(%[digest])\n\t" + "ld t3, 24(%[digest])\n\t" + "ld s8, 32(%[digest])\n\t" + "ld s9, 40(%[digest])\n\t" + "ld s10, 48(%[digest])\n\t" + "ld s11, 56(%[digest])\n\t" + REV8(REG_T0, REG_T0) + REV8(REG_T1, REG_T1) + REV8(REG_T2, REG_T2) + REV8(REG_T3, REG_T3) + REV8(REG_S8, REG_S8) + REV8(REG_S9, REG_S9) + REV8(REG_S10, REG_S10) + REV8(REG_S11, REG_S11) + "sd t0, 0(%[hash])\n\t" + "sd t1, 8(%[hash])\n\t" + "sd t2, 16(%[hash])\n\t" + "sd t3, 24(%[hash])\n\t" + "sd s8, 32(%[hash])\n\t" + "sd s9, 40(%[hash])\n\t" + "sd s10, 48(%[hash])\n\t" + "sd s11, 56(%[hash])\n\t" +#else + LOAD_DWORD_REV(t0, 0, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(t1, 8, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(t2, 16, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(t3, 24, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(s8, 32, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(s9, 40, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(s10, 48, %[digest], a4, a5, a6, a7) + LOAD_DWORD_REV(s11, 56, %[digest], a4, a5, a6, a7) + "sd t0, 0(%[hash])\n\t" + "sd t1, 8(%[hash])\n\t" + "sd t2, 16(%[hash])\n\t" + "sd t3, 24(%[hash])\n\t" + "sd s8, 32(%[hash])\n\t" + "sd s9, 40(%[hash])\n\t" + "sd s10, 48(%[hash])\n\t" + "sd s11, 56(%[hash])\n\t" +#endif + : + : [digest] "r" (sha512->digest), [hash] "r" (hashRes) + : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s8", "s9", "s10", "s11", "a4", "a5", "a6", "a7" + ); + + if (hashRes == hashBuf) { + XMEMCPY(hash, hashBuf, hashLen); + } +} + + +#ifndef NO_SHA512 + +/* Initialize SHA-512 object for hashing. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] heap Dynamic memory hint. + * @param [in] devId Device Id. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 is NULL. + */ +int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) +{ + int ret = InitSha512(sha512, heap, devId); + if (ret == 0) { + InitSha512_State(sha512); + } + return ret; +} + +/* Initialize SHA-512 object for hashing. 
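+ *
+ * Illustrative one-shot use (sketch only; msg, msgLen and out are
+ * caller-supplied names, not part of the API):
+ *   wc_Sha512 sha;
+ *   byte out[WC_SHA512_DIGEST_SIZE];
+ *   if (wc_InitSha512(&sha) == 0) {
+ *       (void)wc_Sha512Update(&sha, msg, msgLen);
+ *       (void)wc_Sha512Final(&sha, out);
+ *       wc_Sha512Free(&sha);
+ *   }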
+ * + * @param [in, out] sha512 SHA-512 object. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 is NULL. + */ +int wc_InitSha512(wc_Sha512* sha512) +{ + return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID); +} + +/* Free the SHA-512 hash. + * + * @param [in] sha512 SHA-512 object. + */ +void wc_Sha512Free(wc_Sha512* sha512) +{ + /* No dynamic memory allocated. */ + (void)sha512; +} + +/* Update the hash with data. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 is NULL. + * @return BAD_FUNC_ARG when data is NULL but len is not 0. + */ +int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + int ret; + + /* Validate parameters. */ + if ((sha512 == NULL) || ((data == NULL) && (len != 0))) { + ret = BAD_FUNC_ARG; + } + else { + ret = Sha512Update(sha512, data, len); + } + + return ret; +} + +/* Put the current hash into buffer. + * + * @param [in, out] sha512 SHA-512 object. + * @param [out] hash Buffer to hold hash result. + * @param [in] hashLen Length of hash to write out. + */ +static void Sha512FinalRaw(wc_Sha512* sha512, byte* hash, int hashLen) +{ + word32 digest[WC_SHA512_DIGEST_SIZE / sizeof(word32)]; + + ByteReverseWords64((word64*)digest, (word64*)sha512->digest, + WC_SHA512_DIGEST_SIZE); + XMEMCPY(hash, digest, hashLen); +} + +/* Put the current hash into buffer. + * + * @param [in, out] sha512 SHA-512 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 or hash is NULL. + */ +int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + Sha512FinalRaw(sha512, hash, WC_SHA512_DIGEST_SIZE); + } + + return ret; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha512 SHA-512 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 or hash is NULL. + */ +int wc_Sha512Final(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Finalize hash. */ + Sha512Final(sha512, hash, WC_SHA512_DIGEST_SIZE); + /* Restart SHA-512 object for next hash. */ + InitSha512_State(sha512); + } + + return ret; +} + +/* Finalize the hash and put into buffer but don't modify state. + * + * @param [in, out] sha512 SHA-512 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 or hash is NULL. + */ +int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash) +{ + int ret; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha512 tmpSha512; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + /* Finalize copy. */ + Sha512Final(&tmpSha512, hash, WC_SHA512_DIGEST_SIZE); + wc_Sha512Free(&tmpSha512); + } + } + + return ret; +} + +#ifdef WOLFSSL_HASH_FLAGS +/* Set flags of SHA-512 object. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] flags Flags to set. + * @return 0 on success. + */ +int wc_Sha512SetFlags(wc_Sha512* sha512, word32 flags) +{ + /* Check we have an object to use. 
*/ + if (sha512 != NULL) { + sha512->flags = flags; + } + return 0; +} +/* Get flags of SHA-512 object. + * + * @param [in] sha512 SHA-512 object. + * @param [out] flags Flags from SHA-512 object. + * @return 0 on success. + */ +int wc_Sha512GetFlags(wc_Sha512* sha512, word32* flags) +{ + /* Check we have an object and return parameter to use. */ + if ((sha512 != NULL) && (flags != NULL)) { + *flags = sha512->flags; + } + return 0; +} +#endif + +/* Deep copy the SHA-512 object. + * + * @param [in] src SHA-512 object to copy. + * @param [out] dst SHA-512 object to fill. + * @return 0 on success. + * @return BAD_FUNC_ARG when src or dst is NULL. + */ +int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst) +{ + int ret = 0; + + /* Validate parameters. */ + if ((src == NULL) || (dst == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + XMEMCPY(dst, src, sizeof(wc_Sha512)); + } + + return ret; +} + +#ifdef OPENSSL_EXTRA +/* Update the hash with one block of data. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] data Buffer of data to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 or data is NULL. + */ +int wc_Sha512Transform(wc_Sha512* sha512, const unsigned char* data) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (data == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + ByteReverseWords((word32*)sha512->buffer, (word32*)data, WC_SHA512_BLOCK_SIZE); + Sha512Transform(sha512, (byte*)sha512->buffer, 1); + } + + return ret; +} +#endif + +#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH) +/* Update the hash with one block of data and optionally get hash. + * + * @param [in, out] sha512 SHA-512 object. + * @param [in] data Buffer of data to hash. + * @param [out] hash Buffer to hold hash. May be NULL. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha512 or data is NULL. + */ +int wc_Sha512HashBlock(wc_Sha512* sha512, const unsigned char* data, + unsigned char* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (data == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Hash block. */ + Sha512Transform(sha512, data, 1); + + if (hash != NULL) { + /* Reverse bytes in digest. */ + word32* hash32 = (word32*)hash; + word32* digest = (word32*)sha512->digest; + hash32[0] = ByteReverseWord32(digest[0]); + hash32[1] = ByteReverseWord32(digest[1]); + hash32[2] = ByteReverseWord32(digest[2]); + hash32[3] = ByteReverseWord32(digest[3]); + hash32[4] = ByteReverseWord32(digest[4]); + hash32[5] = ByteReverseWord32(digest[5]); + hash32[6] = ByteReverseWord32(digest[6]); + hash32[7] = ByteReverseWord32(digest[7]); + /* Reset state. 
*/ + #ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha512->digest[0] = 0x6A09E667L; + sha512->digest[1] = 0xBB67AE85L; + sha512->digest[2] = 0x3C6EF372L; + sha512->digest[3] = 0xA54FF53AL; + sha512->digest[4] = 0x510E527FL; + sha512->digest[5] = 0x9B05688CL; + sha512->digest[6] = 0x1F83D9ABL; + sha512->digest[7] = 0x5BE0CD19L; + #else + /* f, e, b, a, h, g, d, c */ + sha512->digest[0] = 0x9B05688CL; + sha512->digest[1] = 0x510E527FL; + sha512->digest[2] = 0xBB67AE85L; + sha512->digest[3] = 0x6A09E667L; + sha512->digest[4] = 0x5BE0CD19L; + sha512->digest[5] = 0x1F83D9ABL; + sha512->digest[6] = 0xA54FF53AL; + sha512->digest[7] = 0x3C6EF372L; + #endif + } + } + + return ret; +} +#endif /* WOLFSSL_HAVE_LMS && !WOLFSSL_LMS_FULL_HASH */ + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) + +#if !defined(WOLFSSL_NOSHA512_224) + +int wc_InitSha512_224_ex(wc_Sha512* sha512, void* heap, int devId) +{ + int ret = InitSha512(sha512, heap, devId); + if (ret == 0) { + InitSha512_224_State(sha512); + } + return ret; +} +int wc_InitSha512_224(wc_Sha512* sha512) +{ + return wc_InitSha512_224_ex(sha512, NULL, INVALID_DEVID); +} +int wc_Sha512_224Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + return wc_Sha512Update(sha512, data, len); +} +int wc_Sha512_224FinalRaw(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + Sha512FinalRaw(sha512, hash, WC_SHA512_224_DIGEST_SIZE); + } + + return ret; +} +int wc_Sha512_224Final(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Finalize hash. */ + Sha512Final(sha512, hash, WC_SHA512_224_DIGEST_SIZE); + /* Restart SHA-512 object for next hash. */ + InitSha512_224_State(sha512); + } + + return ret; +} +void wc_Sha512_224Free(wc_Sha512* sha512) +{ + wc_Sha512Free(sha512); +} +int wc_Sha512_224GetHash(wc_Sha512* sha512, byte* hash) +{ + int ret; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha512 tmpSha512; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + /* Finalize copy. 
*/ + Sha512Final(&tmpSha512, hash, WC_SHA512_224_DIGEST_SIZE); + wc_Sha512Free(&tmpSha512); + } + } + + return ret; +} +int wc_Sha512_224Copy(wc_Sha512* src, wc_Sha512* dst) +{ + return wc_Sha512Copy(src, dst); +} + +#ifdef WOLFSSL_HASH_FLAGS +int wc_Sha512_224SetFlags(wc_Sha512* sha512, word32 flags) +{ + return wc_Sha512SetFlags(sha512, flags); +} +int wc_Sha512_224GetFlags(wc_Sha512* sha512, word32* flags) +{ + return wc_Sha512GetFlags(sha512, flags); +} +#endif /* WOLFSSL_HASH_FLAGS */ + +#if defined(OPENSSL_EXTRA) +int wc_Sha512_224Transform(wc_Sha512* sha512, const unsigned char* data) +{ + return wc_Sha512Transform(sha512, data); +} +#endif /* OPENSSL_EXTRA */ + +#endif /* !WOLFSSL_NOSHA512_224 */ + +#if !defined(WOLFSSL_NOSHA512_256) + +int wc_InitSha512_256_ex(wc_Sha512* sha512, void* heap, int devId) +{ + int ret = InitSha512(sha512, heap, devId); + if (ret == 0) { + InitSha512_256_State(sha512); + } + return ret; +} +int wc_InitSha512_256(wc_Sha512* sha512) +{ + return wc_InitSha512_256_ex(sha512, NULL, INVALID_DEVID); +} +int wc_Sha512_256Update(wc_Sha512* sha512, const byte* data, word32 len) +{ + return wc_Sha512Update(sha512, data, len); +} +int wc_Sha512_256FinalRaw(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + Sha512FinalRaw(sha512, hash, WC_SHA512_256_DIGEST_SIZE); + } + + return ret; +} +int wc_Sha512_256Final(wc_Sha512* sha512, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Finalize hash. */ + Sha512Final(sha512, hash, WC_SHA512_256_DIGEST_SIZE); + /* Restart SHA-512 object for next hash. */ + InitSha512_256_State(sha512); + } + + return ret; +} +void wc_Sha512_256Free(wc_Sha512* sha512) +{ + wc_Sha512Free(sha512); +} +int wc_Sha512_256GetHash(wc_Sha512* sha512, byte* hash) +{ + int ret; + + /* Validate parameters. */ + if ((sha512 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha512 tmpSha512; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + /* Finalize copy. */ + Sha512Final(&tmpSha512, hash, WC_SHA512_256_DIGEST_SIZE); + wc_Sha512Free(&tmpSha512); + } + } + + return ret; +} +int wc_Sha512_256Copy(wc_Sha512* src, wc_Sha512* dst) +{ + return wc_Sha512Copy(src, dst); +} + +#ifdef WOLFSSL_HASH_FLAGS +int wc_Sha512_256SetFlags(wc_Sha512* sha512, word32 flags) +{ + return wc_Sha512SetFlags(sha512, flags); +} +int wc_Sha512_256GetFlags(wc_Sha512* sha512, word32* flags) +{ + return wc_Sha512GetFlags(sha512, flags); +} +#endif /* WOLFSSL_HASH_FLAGS */ + +#if defined(OPENSSL_EXTRA) +int wc_Sha512_256Transform(wc_Sha512* sha512, const unsigned char* data) +{ + return wc_Sha512Transform(sha512, data); +} +#endif /* OPENSSL_EXTRA */ + +#endif /* !WOLFSSL_NOSHA512_224 */ + +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + +#endif /* !NO_SHA512 */ + + +#ifdef WOLFSSL_SHA384 + +/* Initialize SHA-384 object for hashing. + * + * @param [in, out] sha384 SHA-384 object. + */ +static void InitSha384(wc_Sha384* sha384) +{ + /* Set initial hash values. 
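+     * These are the SHA-384 initial hash values from FIPS 180-4; the final
+     * digest is the leftmost six 64-bit words of the state
+     * (WC_SHA384_DIGEST_SIZE bytes).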
*/ +#ifndef WOLFSSL_RISCV_VECTOR_CRYPTO_ASM + sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8); + sha384->digest[1] = W64LIT(0x629a292a367cd507); + sha384->digest[2] = W64LIT(0x9159015a3070dd17); + sha384->digest[3] = W64LIT(0x152fecd8f70e5939); + sha384->digest[4] = W64LIT(0x67332667ffc00b31); + sha384->digest[5] = W64LIT(0x8eb44a8768581511); + sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7); + sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4); +#else + /* f, e, b, a, h, g, d, c */ + sha384->digest[0] = W64LIT(0x8eb44a8768581511); + sha384->digest[1] = W64LIT(0x67332667ffc00b31); + sha384->digest[2] = W64LIT(0x629a292a367cd507); + sha384->digest[3] = W64LIT(0xcbbb9d5dc1059ed8); + sha384->digest[4] = W64LIT(0x47b5481dbefa4fa4); + sha384->digest[5] = W64LIT(0xdb0c2e0d64f98fa7); + sha384->digest[6] = W64LIT(0x152fecd8f70e5939); + sha384->digest[7] = W64LIT(0x9159015a3070dd17); +#endif + + /* No hashed data. */ + sha384->buffLen = 0; + /* No data hashed. */ + sha384->loLen = 0; + sha384->hiLen = 0; +} + +/* Initialize SHA-384 object for hashing. + * + * @param [in, out] sha384 SHA-384 object. + * @param [in] heap Dynamic memory hint. + * @param [in] devId Device Id. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 is NULL. + */ +int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId) +{ + int ret = InitSha512(sha384, heap, devId); + if (ret == 0) { + InitSha384(sha384); + } + return ret; +} + +/* Initialize SHA-384 object for hashing. + * + * @param [in, out] sha384 SHA-384 object. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 is NULL. + */ +int wc_InitSha384(wc_Sha384* sha384) +{ + return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID); +} + +/* Update the hash with data. + * + * @param [in, out] sha384 SHA-384 object. + * @param [in] data Buffer of data to hash. + * @param [in] len Number of bytes in buffer to hash. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 is NULL. + * @return BAD_FUNC_ARG when data is NULL but len is not 0. + */ +int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len) +{ + int ret; + + /* Validate parameters. */ + if ((sha384 == NULL) || ((data == NULL) && (len > 0))) { + ret = BAD_FUNC_ARG; + } + else { + ret = Sha512Update((wc_Sha512 *)sha384, data, len); + } + + return ret; +} + +/* Put the current hash into buffer. + * + * @param [in, out] sha384 SHA-384 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 or hash is NULL. + */ +int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash) +{ + word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)]; + + if (sha384 == NULL || hash == NULL) { + return BAD_FUNC_ARG; + } + + ByteReverseWords64((word64*)digest, (word64*)sha384->digest, + WC_SHA384_DIGEST_SIZE); + XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE); + + return 0; +} + +/* Finalize the hash and put into buffer. + * + * @param [in, out] sha384 SHA-384 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 or hash is NULL. + */ +int wc_Sha384Final(wc_Sha384* sha384, byte* hash) +{ + int ret = 0; + + /* Validate parameters. */ + if ((sha384 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* Finalize hash. */ + Sha512Final((wc_Sha512*)sha384, hash, WC_SHA384_DIGEST_SIZE); + /* Restart SHA-384 object for next hash. */ + InitSha384(sha384); + } + + return ret; +} + +/* Free the SHA-384 hash. + * + * @param [in] sha384 SHA-384 object. 
+ */ +void wc_Sha384Free(wc_Sha384* sha384) +{ + /* No dynamic memory allocated. */ + (void)sha384; +} + +/* Finalize the hash and put into buffer but don't modify state. + * + * @param [in, out] sha384 SHA-384 object. + * @param [out] hash Buffer to hold hash result. + * @return 0 on success. + * @return BAD_FUNC_ARG when sha384 or hash is NULL. + */ +int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash) +{ + int ret; + + /* Validate parameters. */ + if ((sha384 == NULL) || (hash == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + wc_Sha384 tmpSha384; + /* Create a copy of the hash to finalize. */ + ret = wc_Sha384Copy(sha384, &tmpSha384); + if (ret == 0) { + /* Finalize copy. */ + ret = wc_Sha384Final(&tmpSha384, hash); + } + } + + return ret; +} + +#ifdef WOLFSSL_HASH_FLAGS +/* Set flags of SHA-384 object. + * + * @param [in, out] sha384 SHA-384 object. + * @param [in] flags Flags to set. + * @return 0 on success. + */ +int wc_Sha384SetFlags(wc_Sha384* sha384, word32 flags) +{ + /* Check we have an object to use. */ + if (sha384 != NULL) { + sha384->flags = flags; + } + return 0; +} +/* Get flags of SHA-384 object. + * + * @param [in] sha384 SHA-384 object. + * @param [out] flags Flags from SHA-384 object. + * @return 0 on success. + */ +int wc_Sha384GetFlags(wc_Sha384* sha384, word32* flags) +{ + /* Check we have an object and return parameter to use. */ + if ((sha384 != NULL) && (flags != NULL)) { + *flags = sha384->flags; + } + return 0; +} +#endif + +/* Deep copy the SHA-384 object. + * + * @param [in] src SHA-384 object to copy. + * @param [out] dst SHA-384 object to fill. + * @return 0 on success. + * @return BAD_FUNC_ARG when src or dst is NULL. + */ +int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst) +{ + int ret = 0; + + /* Validate parameters. */ + if ((src == NULL) || (dst == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + XMEMCPY(dst, src, sizeof(wc_Sha384)); + } + + return ret; +} + +#endif /* WOLFSSL_SHA384 */ + +#endif /* !NO_SHA512 || WOLFSSL_SHA384 */ +#endif /* WOLFSSL_RISCV_ASM */ diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S new file mode 100644 index 000000000..a67002073 --- /dev/null +++ b/wolfcrypt/src/sha3_asm.S @@ -0,0 +1,14883 @@ +/* sha3_asm.S */ +/* + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef WOLFSSL_USER_SETTINGS +#ifdef WOLFSSL_USER_SETTINGS_ASM +/* + * user_settings_asm.h is a file generated by the script user_settings_asm.sh. + * The script takes in a user_settings.h and produces user_settings_asm.h, which + * is a stripped down version of user_settings.h containing only preprocessor + * directives. This makes the header safe to include in assembly (.S) files. + */ +#include "user_settings_asm.h" +#else +/* + * Note: if user_settings.h contains any C code (e.g. 
a typedef or function + * prototype), including it here in an assembly (.S) file will cause an + * assembler failure. See user_settings_asm.h above. + */ +#include "user_settings.h" +#endif /* WOLFSSL_USER_SETTINGS_ASM */ +#endif /* WOLFSSL_USER_SETTINGS */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#define HAVE_INTEL_AVX2 +#endif /* NO_AVX2_SUPPORT */ + +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_sha3_avx2_r: +.quad 0x1,0x1 +.quad 0x1,0x1 +.quad 0x8082,0x8082 +.quad 0x8082,0x8082 +.quad 0x800000000000808a,0x800000000000808a +.quad 0x800000000000808a,0x800000000000808a +.quad 0x8000000080008000,0x8000000080008000 +.quad 0x8000000080008000,0x8000000080008000 +.quad 0x808b,0x808b +.quad 0x808b,0x808b +.quad 0x80000001,0x80000001 +.quad 0x80000001,0x80000001 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000000008009,0x8000000000008009 +.quad 0x8000000000008009,0x8000000000008009 +.quad 0x8a,0x8a +.quad 0x8a,0x8a +.quad 0x88,0x88 +.quad 0x88,0x88 +.quad 0x80008009,0x80008009 +.quad 0x80008009,0x80008009 +.quad 0x8000000a,0x8000000a +.quad 0x8000000a,0x8000000a +.quad 0x8000808b,0x8000808b +.quad 0x8000808b,0x8000808b +.quad 0x800000000000008b,0x800000000000008b +.quad 0x800000000000008b,0x800000000000008b +.quad 0x8000000000008089,0x8000000000008089 +.quad 0x8000000000008089,0x8000000000008089 +.quad 0x8000000000008003,0x8000000000008003 +.quad 0x8000000000008003,0x8000000000008003 +.quad 0x8000000000008002,0x8000000000008002 +.quad 0x8000000000008002,0x8000000000008002 +.quad 0x8000000000000080,0x8000000000000080 +.quad 0x8000000000000080,0x8000000000000080 +.quad 0x800a,0x800a +.quad 0x800a,0x800a +.quad 0x800000008000000a,0x800000008000000a +.quad 0x800000008000000a,0x800000008000000a +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000000008080,0x8000000000008080 +.quad 0x8000000000008080,0x8000000000008080 +.quad 0x80000001,0x80000001 +.quad 0x80000001,0x80000001 +.quad 0x8000000080008008,0x8000000080008008 +.quad 0x8000000080008008,0x8000000080008008 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_sha3_x4_avx2_r: +.quad 0x1,0x1 +.quad 0x1,0x1 +.quad 0x8082,0x8082 +.quad 0x8082,0x8082 +.quad 0x800000000000808a,0x800000000000808a +.quad 0x800000000000808a,0x800000000000808a +.quad 0x8000000080008000,0x8000000080008000 +.quad 0x8000000080008000,0x8000000080008000 +.quad 0x808b,0x808b +.quad 0x808b,0x808b +.quad 0x80000001,0x80000001 +.quad 0x80000001,0x80000001 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000000008009,0x8000000000008009 +.quad 0x8000000000008009,0x8000000000008009 +.quad 0x8a,0x8a +.quad 0x8a,0x8a +.quad 0x88,0x88 +.quad 0x88,0x88 +.quad 0x80008009,0x80008009 +.quad 0x80008009,0x80008009 +.quad 0x8000000a,0x8000000a +.quad 0x8000000a,0x8000000a +.quad 0x8000808b,0x8000808b +.quad 0x8000808b,0x8000808b +.quad 0x800000000000008b,0x800000000000008b +.quad 0x800000000000008b,0x800000000000008b +.quad 0x8000000000008089,0x8000000000008089 +.quad 0x8000000000008089,0x8000000000008089 +.quad 0x8000000000008003,0x8000000000008003 +.quad 0x8000000000008003,0x8000000000008003 +.quad 0x8000000000008002,0x8000000000008002 +.quad 
0x8000000000008002,0x8000000000008002 +.quad 0x8000000000000080,0x8000000000000080 +.quad 0x8000000000000080,0x8000000000000080 +.quad 0x800a,0x800a +.quad 0x800a,0x800a +.quad 0x800000008000000a,0x800000008000000a +.quad 0x800000008000000a,0x800000008000000a +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000080008081,0x8000000080008081 +.quad 0x8000000000008080,0x8000000000008080 +.quad 0x8000000000008080,0x8000000000008080 +.quad 0x80000001,0x80000001 +.quad 0x80000001,0x80000001 +.quad 0x8000000080008008,0x8000000080008008 +.quad 0x8000000080008008,0x8000000080008008 +#ifdef HAVE_INTEL_AVX2 +#ifndef __APPLE__ +.text +.globl sha3_block_bmi2 +.type sha3_block_bmi2,@function +.align 16 +sha3_block_bmi2: +#else +.section __TEXT,__text +.globl _sha3_block_bmi2 +.p2align 4 +_sha3_block_bmi2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rdi), %rsi + addq $0x60, %rdi + # Round 0 + movq %rsi, %r10 + movq -88(%rdi), %r11 + movq -80(%rdi), %r12 + movq -72(%rdi), %r13 + movq -64(%rdi), %r14 + xorq -56(%rdi), %r10 + xorq -48(%rdi), %r11 + xorq -40(%rdi), %r12 + xorq -32(%rdi), %r13 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r10 + xorq -8(%rdi), %r11 + xorq (%rdi), %r12 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r14 + xorq 24(%rdi), %r10 + xorq 32(%rdi), %r11 + xorq 40(%rdi), %r12 + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r10 + xorq 72(%rdi), %r11 + xorq 80(%rdi), %r12 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -48(%rdi), %r11 + movq (%rdi), %r12 + movq 48(%rdi), %r13 + movq 96(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, (%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 96(%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x01, %rsi + # Row 1 + movq -72(%rdi), %r10 + movq -24(%rdi), %r11 + movq -16(%rdi), %r12 + movq 32(%rdi), %r13 + movq 80(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -72(%rdi) + # Row 2 + movq -88(%rdi), %r10 + movq -40(%rdi), %r11 + movq 8(%rdi), %r12 + movq 56(%rdi), %r13 + movq 64(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, 
%r15 + movq %r15, -88(%rdi) + # Row 3 + movq -64(%rdi), %r10 + movq -56(%rdi), %r11 + movq -8(%rdi), %r12 + movq 40(%rdi), %r13 + movq 88(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -64(%rdi) + # Row 4 + xorq -80(%rdi), %rcx + xorq -32(%rdi), %r8 + xorq 16(%rdi), %r9 + xorq 24(%rdi), %rdx + xorq 72(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -80(%rdi) + movq %r11, -32(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, 72(%rdi) + # Round 1 + xorq %rsi, %r10 + xorq -88(%rdi), %r10 + xorq -72(%rdi), %r10 + xorq -64(%rdi), %r10 + xorq -56(%rdi), %r11 + xorq -48(%rdi), %r11 + xorq -40(%rdi), %r11 + xorq -24(%rdi), %r11 + xorq -16(%rdi), %r12 + xorq -8(%rdi), %r12 + xorq (%rdi), %r12 + xorq 8(%rdi), %r12 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r14 + xorq 80(%rdi), %r14 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -24(%rdi), %r11 + movq 8(%rdi), %r12 + movq 40(%rdi), %r13 + movq 72(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 72(%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x8082, %rsi + # Row 1 + movq 48(%rdi), %r10 + movq 80(%rdi), %r11 + movq -88(%rdi), %r12 + movq -56(%rdi), %r13 + movq 16(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 48(%rdi) + # Row 2 + movq -48(%rdi), %r10 + movq -16(%rdi), %r11 + movq 56(%rdi), %r12 + movq 88(%rdi), %r13 + movq -80(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 
56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -48(%rdi) + # Row 3 + movq 96(%rdi), %r10 + movq -72(%rdi), %r11 + movq -40(%rdi), %r12 + movq -8(%rdi), %r13 + movq 24(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 96(%rdi) + # Row 4 + xorq (%rdi), %rcx + xorq 32(%rdi), %r8 + xorq 64(%rdi), %r9 + xorq -64(%rdi), %rdx + xorq -32(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, (%rdi) + movq %r11, 32(%rdi) + movq %r12, 64(%rdi) + movq %r13, -64(%rdi) + movq %r14, -32(%rdi) + # Round 2 + xorq %rsi, %r10 + xorq -88(%rdi), %r12 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r11 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %r10 + xorq -40(%rdi), %r12 + xorq -24(%rdi), %r11 + xorq -16(%rdi), %r11 + xorq -8(%rdi), %r13 + xorq 8(%rdi), %r12 + xorq 16(%rdi), %r14 + xorq 24(%rdi), %r14 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r10 + xorq 56(%rdi), %r12 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r11 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r10 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 80(%rdi), %r11 + movq 56(%rdi), %r12 + movq -8(%rdi), %r13 + movq -32(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -32(%rdi) + movq $0x800000000000808a, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 40(%rdi), %r10 + movq 16(%rdi), %r11 + movq -48(%rdi), %r12 + movq -72(%rdi), %r13 + movq 64(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 40(%rdi) + # Row 2 + movq -24(%rdi), %r10 + movq -88(%rdi), %r11 + movq 88(%rdi), %r12 + movq 24(%rdi), %r13 + movq (%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + 
rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, (%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -24(%rdi) + # Row 3 + movq 72(%rdi), %r10 + movq 48(%rdi), %r11 + movq -16(%rdi), %r12 + movq -40(%rdi), %r13 + movq -64(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 72(%rdi) + # Row 4 + xorq 8(%rdi), %rcx + xorq -56(%rdi), %r8 + xorq -80(%rdi), %r9 + xorq 96(%rdi), %rdx + xorq 32(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 8(%rdi) + movq %r11, -56(%rdi) + movq %r12, -80(%rdi) + movq %r13, 96(%rdi) + movq %r14, 32(%rdi) + # Round 3 + xorq %rsi, %r10 + xorq -88(%rdi), %r11 + xorq -72(%rdi), %r13 + xorq -64(%rdi), %r14 + xorq -48(%rdi), %r12 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r10 + xorq -16(%rdi), %r12 + xorq -8(%rdi), %r13 + xorq (%rdi), %r14 + xorq 16(%rdi), %r11 + xorq 24(%rdi), %r13 + xorq 40(%rdi), %r10 + xorq 48(%rdi), %r11 + xorq 56(%rdi), %r12 + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r10 + xorq 80(%rdi), %r11 + xorq 88(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 16(%rdi), %r11 + movq 88(%rdi), %r12 + movq -40(%rdi), %r13 + movq 32(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 32(%rdi) + movq $0x8000000080008000, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -8(%rdi), %r10 + movq 64(%rdi), %r11 + movq -24(%rdi), %r12 + movq 48(%rdi), %r13 + movq -80(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -8(%rdi) + # Row 
2 + movq 80(%rdi), %r10 + movq -48(%rdi), %r11 + movq 24(%rdi), %r12 + movq -64(%rdi), %r13 + movq 8(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 80(%rdi) + # Row 3 + movq -32(%rdi), %r10 + movq 40(%rdi), %r11 + movq -88(%rdi), %r12 + movq -16(%rdi), %r13 + movq 96(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 96(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -32(%rdi) + # Row 4 + xorq 56(%rdi), %rcx + xorq -72(%rdi), %r8 + xorq (%rdi), %r9 + xorq 72(%rdi), %rdx + xorq -56(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 56(%rdi) + movq %r11, -72(%rdi) + movq %r12, (%rdi) + movq %r13, 72(%rdi) + movq %r14, -56(%rdi) + # Round 4 + xorq %rsi, %r10 + xorq -88(%rdi), %r12 + xorq -80(%rdi), %r14 + xorq -64(%rdi), %r13 + xorq -48(%rdi), %r11 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r10 + xorq -24(%rdi), %r12 + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r10 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %r11 + xorq 24(%rdi), %r12 + xorq 32(%rdi), %r14 + xorq 40(%rdi), %r11 + xorq 48(%rdi), %r13 + xorq 64(%rdi), %r11 + xorq 80(%rdi), %r10 + xorq 88(%rdi), %r12 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 64(%rdi), %r11 + movq 24(%rdi), %r12 + movq -16(%rdi), %r13 + movq -56(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -56(%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x808b, %rsi + # Row 1 + movq -40(%rdi), %r10 + movq -80(%rdi), %r11 + movq 80(%rdi), %r12 + movq 40(%rdi), %r13 + movq (%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 80(%rdi) + andnq %r10, %r14, %r15 + 
xorq %r13, %r15 + movq %r15, 40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, (%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -40(%rdi) + # Row 2 + movq 16(%rdi), %r10 + movq -24(%rdi), %r11 + movq -64(%rdi), %r12 + movq 96(%rdi), %r13 + movq 56(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 96(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 16(%rdi) + # Row 3 + movq 32(%rdi), %r10 + movq -8(%rdi), %r11 + movq -48(%rdi), %r12 + movq -88(%rdi), %r13 + movq 72(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 32(%rdi) + # Row 4 + xorq 88(%rdi), %rcx + xorq 48(%rdi), %r8 + xorq 8(%rdi), %r9 + xorq -32(%rdi), %rdx + xorq -72(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 88(%rdi) + movq %r11, 48(%rdi) + movq %r12, 8(%rdi) + movq %r13, -32(%rdi) + movq %r14, -72(%rdi) + # Round 5 + xorq %rsi, %r10 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r11 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %r14 + xorq -48(%rdi), %r12 + xorq -40(%rdi), %r10 + xorq -24(%rdi), %r11 + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r11 + xorq (%rdi), %r14 + xorq 16(%rdi), %r10 + xorq 24(%rdi), %r12 + xorq 32(%rdi), %r10 + xorq 40(%rdi), %r13 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r11 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r12 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -80(%rdi), %r11 + movq -64(%rdi), %r12 + movq -88(%rdi), %r13 + movq -72(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -72(%rdi) + movq $0x80000001, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -16(%rdi), %r10 + movq (%rdi), %r11 + movq 16(%rdi), %r12 + movq -8(%rdi), %r13 + movq 8(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, 
%r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, (%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -16(%rdi) + # Row 2 + movq 64(%rdi), %r10 + movq 80(%rdi), %r11 + movq 96(%rdi), %r12 + movq 72(%rdi), %r13 + movq 88(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 96(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 64(%rdi) + # Row 3 + movq -56(%rdi), %r10 + movq -40(%rdi), %r11 + movq -24(%rdi), %r12 + movq -48(%rdi), %r13 + movq -32(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -56(%rdi) + # Row 4 + xorq 24(%rdi), %rcx + xorq 40(%rdi), %r8 + xorq 56(%rdi), %r9 + xorq 32(%rdi), %rdx + xorq 48(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 24(%rdi) + movq %r11, 40(%rdi) + movq %r12, 56(%rdi) + movq %r13, 32(%rdi) + movq %r14, 48(%rdi) + # Round 6 + xorq %rsi, %r10 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r11 + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %r10 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r11 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r12 + xorq -16(%rdi), %r10 + xorq -8(%rdi), %r13 + xorq (%rdi), %r11 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %r12 + xorq 64(%rdi), %r10 + xorq 72(%rdi), %r13 + xorq 80(%rdi), %r11 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq (%rdi), %r11 + movq 96(%rdi), %r12 + movq -48(%rdi), %r13 + movq 48(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, (%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 96(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 48(%rdi) + movq $0x8000000080008081, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -88(%rdi), %r10 + movq 8(%rdi), %r11 + 
movq 64(%rdi), %r12 + movq -40(%rdi), %r13 + movq 56(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -88(%rdi) + # Row 2 + movq -80(%rdi), %r10 + movq 16(%rdi), %r11 + movq 72(%rdi), %r12 + movq -32(%rdi), %r13 + movq 24(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -80(%rdi) + # Row 3 + movq -72(%rdi), %r10 + movq -16(%rdi), %r11 + movq 80(%rdi), %r12 + movq -24(%rdi), %r13 + movq 32(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -72(%rdi) + # Row 4 + xorq -64(%rdi), %rcx + xorq -8(%rdi), %r8 + xorq 88(%rdi), %r9 + xorq -56(%rdi), %rdx + xorq 40(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -64(%rdi) + movq %r11, -8(%rdi) + movq %r12, 88(%rdi) + movq %r13, -56(%rdi) + movq %r14, 40(%rdi) + # Round 7 + xorq %rsi, %r10 + xorq -88(%rdi), %r10 + xorq -80(%rdi), %r10 + xorq -72(%rdi), %r10 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r13 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r11 + xorq (%rdi), %r11 + xorq 8(%rdi), %r11 + xorq 16(%rdi), %r11 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %r14 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r12 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 8(%rdi), %r11 + movq 72(%rdi), %r12 + movq -24(%rdi), %r13 + movq 40(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -24(%rdi) + andnq %r11, %r10, %r15 + xorq 
%r14, %r15 + movq %r15, 40(%rdi) + movq $0x8000000000008009, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -48(%rdi), %r10 + movq 56(%rdi), %r11 + movq -80(%rdi), %r12 + movq -16(%rdi), %r13 + movq 88(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -48(%rdi) + # Row 2 + movq (%rdi), %r10 + movq 64(%rdi), %r11 + movq -32(%rdi), %r12 + movq 32(%rdi), %r13 + movq -64(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, (%rdi) + # Row 3 + movq 48(%rdi), %r10 + movq -88(%rdi), %r11 + movq 16(%rdi), %r12 + movq 80(%rdi), %r13 + movq -56(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 48(%rdi) + # Row 4 + xorq 96(%rdi), %rcx + xorq -40(%rdi), %r8 + xorq 24(%rdi), %r9 + xorq -72(%rdi), %rdx + xorq -8(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 96(%rdi) + movq %r11, -40(%rdi) + movq %r12, 24(%rdi) + movq %r13, -72(%rdi) + movq %r14, -8(%rdi) + # Round 8 + xorq %rsi, %r10 + xorq -88(%rdi), %r11 + xorq -80(%rdi), %r12 + xorq -64(%rdi), %r14 + xorq -56(%rdi), %r14 + xorq -48(%rdi), %r10 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r13 + xorq (%rdi), %r10 + xorq 8(%rdi), %r11 + xorq 16(%rdi), %r12 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r10 + xorq 56(%rdi), %r11 + xorq 64(%rdi), %r11 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 56(%rdi), %r11 + movq -32(%rdi), %r12 + movq 80(%rdi), %r13 + movq -8(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 
+ xorq %r11, %r15 + movq %r15, 56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -8(%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x8a, %rsi + # Row 1 + movq -24(%rdi), %r10 + movq 88(%rdi), %r11 + movq (%rdi), %r12 + movq -88(%rdi), %r13 + movq 24(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, (%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -24(%rdi) + # Row 2 + movq 8(%rdi), %r10 + movq -80(%rdi), %r11 + movq 32(%rdi), %r12 + movq -56(%rdi), %r13 + movq 96(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 96(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 8(%rdi) + # Row 3 + movq 40(%rdi), %r10 + movq -48(%rdi), %r11 + movq 64(%rdi), %r12 + movq 16(%rdi), %r13 + movq -72(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 40(%rdi) + # Row 4 + xorq 72(%rdi), %rcx + xorq -16(%rdi), %r8 + xorq -64(%rdi), %r9 + xorq 48(%rdi), %rdx + xorq -40(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 72(%rdi) + movq %r11, -16(%rdi) + movq %r12, -64(%rdi) + movq %r13, 48(%rdi) + movq %r14, -40(%rdi) + # Round 9 + xorq %rsi, %r10 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r11 + xorq -72(%rdi), %r14 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %r11 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %r10 + xorq -8(%rdi), %r14 + xorq (%rdi), %r12 + xorq 8(%rdi), %r10 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %r12 + xorq 40(%rdi), %r10 + xorq 56(%rdi), %r11 + xorq 64(%rdi), %r12 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r11 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 88(%rdi), %r11 + movq 32(%rdi), %r12 + movq 16(%rdi), %r13 + movq -40(%rdi), %r14 + xorq %rdx, 
%r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -40(%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x88, %rsi + # Row 1 + movq 80(%rdi), %r10 + movq 24(%rdi), %r11 + movq 8(%rdi), %r12 + movq -48(%rdi), %r13 + movq -64(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 80(%rdi) + # Row 2 + movq 56(%rdi), %r10 + movq (%rdi), %r11 + movq -56(%rdi), %r12 + movq -72(%rdi), %r13 + movq 72(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, (%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 56(%rdi) + # Row 3 + movq -8(%rdi), %r10 + movq -24(%rdi), %r11 + movq -80(%rdi), %r12 + movq 64(%rdi), %r13 + movq 48(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -8(%rdi) + # Row 4 + xorq -32(%rdi), %rcx + xorq -88(%rdi), %r8 + xorq 96(%rdi), %r9 + xorq 40(%rdi), %rdx + xorq -16(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -32(%rdi) + movq %r11, -88(%rdi) + movq %r12, 96(%rdi) + movq %r13, 40(%rdi) + movq %r14, -16(%rdi) + # Round 10 + xorq %rsi, %r10 + xorq -80(%rdi), %r12 + xorq -72(%rdi), %r13 + xorq -64(%rdi), %r14 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r14 + xorq -24(%rdi), %r11 + xorq -8(%rdi), %r10 + xorq (%rdi), %r11 + xorq 8(%rdi), %r12 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r11 + xorq 32(%rdi), %r12 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r10 + xorq 64(%rdi), %r13 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r10 + xorq 88(%rdi), %r11 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq 
%r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 24(%rdi), %r11 + movq -56(%rdi), %r12 + movq 64(%rdi), %r13 + movq -16(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -16(%rdi) + movq $0x80008009, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 16(%rdi), %r10 + movq -64(%rdi), %r11 + movq 56(%rdi), %r12 + movq -24(%rdi), %r13 + movq 96(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 96(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 16(%rdi) + # Row 2 + movq 88(%rdi), %r10 + movq 8(%rdi), %r11 + movq -72(%rdi), %r12 + movq 48(%rdi), %r13 + movq -32(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 88(%rdi) + # Row 3 + movq -40(%rdi), %r10 + movq 80(%rdi), %r11 + movq (%rdi), %r12 + movq -80(%rdi), %r13 + movq 40(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, (%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -40(%rdi) + # Row 4 + xorq 32(%rdi), %rcx + xorq -48(%rdi), %r8 + xorq 72(%rdi), %r9 + xorq -8(%rdi), %rdx + xorq -88(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 32(%rdi) + movq %r11, -48(%rdi) + movq %r12, 72(%rdi) + movq %r13, -8(%rdi) + movq %r14, -88(%rdi) + # Round 11 + xorq %rsi, %r10 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %r12 + xorq -64(%rdi), %r11 + xorq -56(%rdi), %r12 + xorq -40(%rdi), %r10 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r14 + xorq (%rdi), %r12 + xorq 8(%rdi), %r11 + xorq 16(%rdi), %r10 + xorq 24(%rdi), %r11 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r12 + xorq 64(%rdi), %r13 + xorq 80(%rdi), %r11 + xorq 88(%rdi), %r10 + xorq 96(%rdi), %r14 + 
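The xorq ladder at the start of every "# Round N" block and the "# Calc t[0..4]" group that follows it are the theta step of Keccak-f[1600]: the five column parities are accumulated in %r10-%r14, then each rorxq $63 (a rotate right by 63, i.e. a rotate left by 1) plus a final xorq forms the per-column correction t[x]. Note that the prologue did "movq (%rdi), %rsi; addq $0x60, %rdi", so lane 0 of the 25-lane state is kept in %rsi and lane k sits at displacement 8*k - 0x60, i.e. -88 through +96, presumably so every lane access gets a short one-byte displacement. A minimal C sketch of the same step, assuming the usual flat lane order s[x + 5*y]; the names theta and ROTL64 are illustrative, not taken from this file:

    #include <stdint.h>

    #define ROTL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))

    /* theta: t[x] = c[x-1] ^ ROTL64(c[x+1], 1) is what each
     * "rorxq $63, ...; xorq ..." pair computes; every lane in
     * column x is then XORed with t[x]. */
    static void theta(uint64_t s[25])
    {
        uint64_t c[5], t[5];
        int x, y;

        for (x = 0; x < 5; x++)
            c[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
        for (x = 0; x < 5; x++)
            t[x] = c[(x + 4) % 5] ^ ROTL64(c[(x + 1) % 5], 1);
        for (y = 0; y < 25; y += 5)
            for (x = 0; x < 5; x++)
                s[y + x] ^= t[x];
    }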
# Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -64(%rdi), %r11 + movq -72(%rdi), %r12 + movq -80(%rdi), %r13 + movq -88(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -88(%rdi) + movq $0x8000000a, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 64(%rdi), %r10 + movq 96(%rdi), %r11 + movq 88(%rdi), %r12 + movq 80(%rdi), %r13 + movq 72(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 96(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 64(%rdi) + # Row 2 + movq 24(%rdi), %r10 + movq 56(%rdi), %r11 + movq 48(%rdi), %r12 + movq 40(%rdi), %r13 + movq 32(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 24(%rdi) + # Row 3 + movq -16(%rdi), %r10 + movq 16(%rdi), %r11 + movq 8(%rdi), %r12 + movq (%rdi), %r13 + movq -8(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, (%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -16(%rdi) + # Row 4 + xorq -56(%rdi), %rcx + xorq -24(%rdi), %r8 + xorq -32(%rdi), %r9 + xorq -40(%rdi), %rdx + xorq -48(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -56(%rdi) + movq %r11, -24(%rdi) + movq %r12, -32(%rdi) + movq %r13, -40(%rdi) + movq %r14, -48(%rdi) + # Round 12 + xorq %rsi, %r10 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %r12 + xorq -64(%rdi), %r11 + xorq -16(%rdi), %r10 + xorq -8(%rdi), %r14 + xorq (%rdi), %r13 + xorq 8(%rdi), %r12 + xorq 16(%rdi), %r11 + xorq 24(%rdi), %r10 + xorq 
32(%rdi), %r14 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r12 + xorq 56(%rdi), %r11 + xorq 64(%rdi), %r10 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r12 + xorq 96(%rdi), %r11 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 96(%rdi), %r11 + movq 48(%rdi), %r12 + movq (%rdi), %r13 + movq -48(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 96(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, (%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -48(%rdi) + movq $0x8000808b, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -80(%rdi), %r10 + movq 72(%rdi), %r11 + movq 24(%rdi), %r12 + movq 16(%rdi), %r13 + movq -32(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -80(%rdi) + # Row 2 + movq -64(%rdi), %r10 + movq 88(%rdi), %r11 + movq 40(%rdi), %r12 + movq -8(%rdi), %r13 + movq -56(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -64(%rdi) + # Row 3 + movq -88(%rdi), %r10 + movq 64(%rdi), %r11 + movq 56(%rdi), %r12 + movq 8(%rdi), %r13 + movq -40(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -88(%rdi) + # Row 4 + xorq -72(%rdi), %rcx + xorq 80(%rdi), %r8 + xorq 32(%rdi), %r9 + xorq -16(%rdi), %rdx + xorq -24(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -72(%rdi) + movq %r11, 80(%rdi) + movq %r12, 32(%rdi) + movq %r13, -16(%rdi) + movq %r14, -24(%rdi) + # Round 13 + xorq %rsi, %r10 + xorq -88(%rdi), %r10 + xorq 
-80(%rdi), %r10 + xorq -64(%rdi), %r10 + xorq -56(%rdi), %r14 + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r14 + xorq -8(%rdi), %r13 + xorq (%rdi), %r13 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r12 + xorq 40(%rdi), %r12 + xorq 48(%rdi), %r12 + xorq 56(%rdi), %r12 + xorq 64(%rdi), %r11 + xorq 72(%rdi), %r11 + xorq 88(%rdi), %r11 + xorq 96(%rdi), %r11 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 72(%rdi), %r11 + movq 40(%rdi), %r12 + movq 8(%rdi), %r13 + movq -24(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -24(%rdi) + movq $0x800000000000008b, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq (%rdi), %r10 + movq -32(%rdi), %r11 + movq -64(%rdi), %r12 + movq 64(%rdi), %r13 + movq 32(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 32(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, (%rdi) + # Row 2 + movq 96(%rdi), %r10 + movq 24(%rdi), %r11 + movq -8(%rdi), %r12 + movq -40(%rdi), %r13 + movq -72(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -40(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 96(%rdi) + # Row 3 + movq -48(%rdi), %r10 + movq -80(%rdi), %r11 + movq 88(%rdi), %r12 + movq 56(%rdi), %r13 + movq -16(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -80(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -48(%rdi) + # Row 4 + xorq 48(%rdi), %rcx + xorq 16(%rdi), %r8 + xorq -56(%rdi), %r9 + xorq -88(%rdi), %rdx + xorq 80(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + 
xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 48(%rdi) + movq %r11, 16(%rdi) + movq %r12, -56(%rdi) + movq %r13, -88(%rdi) + movq %r14, 80(%rdi) + # Round 14 + xorq %rsi, %r10 + xorq -80(%rdi), %r11 + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r12 + xorq -48(%rdi), %r10 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r11 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r14 + xorq -8(%rdi), %r12 + xorq (%rdi), %r10 + xorq 8(%rdi), %r13 + xorq 24(%rdi), %r11 + xorq 32(%rdi), %r14 + xorq 40(%rdi), %r12 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r13 + xorq 72(%rdi), %r11 + xorq 88(%rdi), %r12 + xorq 96(%rdi), %r10 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -32(%rdi), %r11 + movq -8(%rdi), %r12 + movq 56(%rdi), %r13 + movq 80(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 80(%rdi) + movq $0x8000000000008089, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 8(%rdi), %r10 + movq 32(%rdi), %r11 + movq 96(%rdi), %r12 + movq -80(%rdi), %r13 + movq -56(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 96(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 8(%rdi) + # Row 2 + movq 72(%rdi), %r10 + movq -64(%rdi), %r11 + movq -40(%rdi), %r12 + movq -16(%rdi), %r13 + movq 48(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 72(%rdi) + # Row 3 + movq -24(%rdi), %r10 + movq (%rdi), %r11 + movq 24(%rdi), %r12 + movq 88(%rdi), %r13 + movq -88(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, (%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -24(%rdi) + # Row 4 + xorq 40(%rdi), %rcx + xorq 64(%rdi), %r8 + xorq -72(%rdi), %r9 + xorq -48(%rdi), %rdx + xorq 16(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 
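The rotation amounts used in each "# Row Mix" block are the standard rho offsets: rolq 44, 43, 21, 14 for row 0 (its first lane keeps offset 0 and is not rotated), then 28, 20, 3, 45, 61, then 1, 6, 25, 8, 18, then 27, 36, 10, 15, 56, and in each "# Row 4" block rorxq 2, 9, 25, 23, 62, which are left rotations by 62, 55, 39, 41 and 2 since rorxq $n rotates right by n. The changing source displacements from row to row fold in the pi permutation, which moves lane (x, y) to (y, 2x + 3y mod 5). A hedged C sketch of rho and pi together, again assuming lane order s[x + 5*y]; rho_pi and rho_off are illustrative names:

    #include <stdint.h>

    #define ROTL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))

    /* rho offsets for lane (x, y), indexed as x + 5*y */
    static const unsigned rho_off[25] = {
         0,  1, 62, 28, 27,
        36, 44,  6, 55, 20,
         3, 10, 43, 25, 39,
        41, 45, 15, 21,  8,
        18,  2, 61, 56, 14
    };

    /* rho + pi: rotate each lane and drop it into its new position */
    static void rho_pi(uint64_t b[25], const uint64_t s[25])
    {
        int x, y;
        for (y = 0; y < 5; y++) {
            for (x = 0; x < 5; x++) {
                uint64_t v = s[x + 5 * y];
                unsigned n = rho_off[x + 5 * y];
                /* ROTL64 by 0 would shift by 64, so handle it separately */
                b[y + 5 * ((2 * x + 3 * y) % 5)] = (n == 0) ? v : ROTL64(v, n);
            }
        }
    }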
+ rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 40(%rdi) + movq %r11, 64(%rdi) + movq %r12, -72(%rdi) + movq %r13, -48(%rdi) + movq %r14, 16(%rdi) + # Round 15 + xorq %rsi, %r10 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %r13 + xorq -64(%rdi), %r11 + xorq -56(%rdi), %r14 + xorq -40(%rdi), %r12 + xorq -32(%rdi), %r11 + xorq -24(%rdi), %r10 + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r12 + xorq (%rdi), %r11 + xorq 8(%rdi), %r10 + xorq 24(%rdi), %r12 + xorq 32(%rdi), %r11 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r13 + xorq 72(%rdi), %r10 + xorq 80(%rdi), %r14 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 32(%rdi), %r11 + movq -40(%rdi), %r12 + movq 88(%rdi), %r13 + movq 16(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 16(%rdi) + movq $0x8000000000008003, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 56(%rdi), %r10 + movq -56(%rdi), %r11 + movq 72(%rdi), %r12 + movq (%rdi), %r13 + movq -72(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, (%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -72(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 56(%rdi) + # Row 2 + movq -32(%rdi), %r10 + movq 96(%rdi), %r11 + movq -16(%rdi), %r12 + movq -88(%rdi), %r13 + movq 40(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 96(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -32(%rdi) + # Row 3 + movq 80(%rdi), %r10 + movq 8(%rdi), %r11 + movq -64(%rdi), %r12 + movq 24(%rdi), %r13 + movq -48(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 80(%rdi) + # Row 4 
+ xorq -8(%rdi), %rcx + xorq -80(%rdi), %r8 + xorq 48(%rdi), %r9 + xorq -24(%rdi), %rdx + xorq 64(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -8(%rdi) + movq %r11, -80(%rdi) + movq %r12, 48(%rdi) + movq %r13, -24(%rdi) + movq %r14, 64(%rdi) + # Round 16 + xorq %rsi, %r10 + xorq -88(%rdi), %r13 + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %r11 + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r12 + xorq -32(%rdi), %r10 + xorq -16(%rdi), %r12 + xorq (%rdi), %r13 + xorq 8(%rdi), %r11 + xorq 16(%rdi), %r14 + xorq 24(%rdi), %r13 + xorq 32(%rdi), %r11 + xorq 40(%rdi), %r14 + xorq 56(%rdi), %r10 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r10 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r11 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -56(%rdi), %r11 + movq -16(%rdi), %r12 + movq 24(%rdi), %r13 + movq 64(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 64(%rdi) + movq $0x8000000000008002, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 88(%rdi), %r10 + movq -72(%rdi), %r11 + movq -32(%rdi), %r12 + movq 8(%rdi), %r13 + movq 48(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 88(%rdi) + # Row 2 + movq 32(%rdi), %r10 + movq 72(%rdi), %r11 + movq -88(%rdi), %r12 + movq -48(%rdi), %r13 + movq -8(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 32(%rdi) + # Row 3 + movq 16(%rdi), %r10 + movq 56(%rdi), %r11 + movq 96(%rdi), %r12 + movq -64(%rdi), %r13 + movq -24(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 96(%rdi) 
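Each andnq/xorq/movq triple, like the one ending just above, produces one output lane of chi: "andnq A, B, C" stores ~B & A in C, so the triple builds b[x] ^ (~b[x+1] & b[x+2]) without a separate not instruction. A minimal sketch of chi over one 5-lane row (chi_row is an illustrative name):

    #include <stdint.h>

    /* chi: a[x] = b[x] ^ (~b[x+1] & b[x+2]) within each row; the
     * assembly maps the (~ ... & ...) part onto a single andnq. */
    static void chi_row(uint64_t a[5], const uint64_t b[5])
    {
        int x;
        for (x = 0; x < 5; x++)
            a[x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]);
    }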
+ andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 16(%rdi) + # Row 4 + xorq -40(%rdi), %rcx + xorq (%rdi), %r8 + xorq 40(%rdi), %r9 + xorq 80(%rdi), %rdx + xorq -80(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -40(%rdi) + movq %r11, (%rdi) + movq %r12, 40(%rdi) + movq %r13, 80(%rdi) + movq %r14, -80(%rdi) + # Round 17 + xorq %rsi, %r10 + xorq -88(%rdi), %r12 + xorq -72(%rdi), %r11 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %r11 + xorq -48(%rdi), %r13 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r12 + xorq -8(%rdi), %r14 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r10 + xorq 24(%rdi), %r13 + xorq 32(%rdi), %r10 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r11 + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r11 + xorq 88(%rdi), %r10 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -72(%rdi), %r11 + movq -88(%rdi), %r12 + movq -64(%rdi), %r13 + movq -80(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -88(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -80(%rdi) + movq $0x8000000000000080, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 24(%rdi), %r10 + movq 48(%rdi), %r11 + movq 32(%rdi), %r12 + movq 56(%rdi), %r13 + movq 40(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 24(%rdi) + # Row 2 + movq -56(%rdi), %r10 + movq -32(%rdi), %r11 + movq -48(%rdi), %r12 + movq -24(%rdi), %r13 + movq -40(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -56(%rdi) + # Row 3 + movq 64(%rdi), %r10 + movq 88(%rdi), %r11 + movq 72(%rdi), %r12 + movq 96(%rdi), %r13 + movq 80(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + 
rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 96(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 64(%rdi) + # Row 4 + xorq -16(%rdi), %rcx + xorq 8(%rdi), %r8 + xorq -8(%rdi), %r9 + xorq 16(%rdi), %rdx + xorq (%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -16(%rdi) + movq %r11, 8(%rdi) + movq %r12, -8(%rdi) + movq %r13, 16(%rdi) + movq %r14, (%rdi) + # Round 18 + xorq %rsi, %r10 + xorq -88(%rdi), %r12 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r11 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %r10 + xorq -48(%rdi), %r12 + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r11 + xorq -24(%rdi), %r13 + xorq 24(%rdi), %r10 + xorq 32(%rdi), %r12 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r11 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r10 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r14 + xorq 88(%rdi), %r11 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 48(%rdi), %r11 + movq -48(%rdi), %r12 + movq 96(%rdi), %r13 + movq (%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 96(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, (%rdi) + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq $0x800a, %rsi + # Row 1 + movq -64(%rdi), %r10 + movq 40(%rdi), %r11 + movq -56(%rdi), %r12 + movq 88(%rdi), %r13 + movq -8(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 88(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -64(%rdi) + # Row 2 + movq -72(%rdi), %r10 + movq 32(%rdi), %r11 + movq -24(%rdi), %r12 + movq 80(%rdi), %r13 + movq -16(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -72(%rdi) + # Row 3 + movq -80(%rdi), %r10 + movq 
24(%rdi), %r11 + movq -32(%rdi), %r12 + movq 72(%rdi), %r13 + movq 16(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 24(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -80(%rdi) + # Row 4 + xorq -88(%rdi), %rcx + xorq 56(%rdi), %r8 + xorq -40(%rdi), %r9 + xorq 64(%rdi), %rdx + xorq 8(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -88(%rdi) + movq %r11, 56(%rdi) + movq %r12, -40(%rdi) + movq %r13, 64(%rdi) + movq %r14, 8(%rdi) + # Round 19 + xorq %rsi, %r10 + xorq -80(%rdi), %r10 + xorq -72(%rdi), %r10 + xorq -64(%rdi), %r10 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r12 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %r12 + xorq -16(%rdi), %r14 + xorq -8(%rdi), %r14 + xorq (%rdi), %r14 + xorq 16(%rdi), %r14 + xorq 24(%rdi), %r11 + xorq 32(%rdi), %r11 + xorq 40(%rdi), %r11 + xorq 48(%rdi), %r11 + xorq 72(%rdi), %r13 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq 40(%rdi), %r11 + movq -24(%rdi), %r12 + movq 72(%rdi), %r13 + movq 8(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -24(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 8(%rdi) + movq $0x800000008000000a, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 96(%rdi), %r10 + movq -8(%rdi), %r11 + movq -72(%rdi), %r12 + movq 24(%rdi), %r13 + movq -40(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 24(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -40(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 96(%rdi) + # Row 2 + movq 48(%rdi), %r10 + movq -56(%rdi), %r11 + movq 80(%rdi), %r12 + movq 16(%rdi), %r13 + movq -88(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -56(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 80(%rdi) + andnq %r10, %r14, %r15 + xorq 
%r13, %r15 + movq %r15, 16(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 48(%rdi) + # Row 3 + movq (%rdi), %r10 + movq -64(%rdi), %r11 + movq 32(%rdi), %r12 + movq -32(%rdi), %r13 + movq 64(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -64(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 32(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 64(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, (%rdi) + # Row 4 + xorq -48(%rdi), %rcx + xorq 88(%rdi), %r8 + xorq -16(%rdi), %r9 + xorq -80(%rdi), %rdx + xorq 56(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -48(%rdi) + movq %r11, 88(%rdi) + movq %r12, -16(%rdi) + movq %r13, -80(%rdi) + movq %r14, 56(%rdi) + # Round 20 + xorq %rsi, %r10 + xorq -88(%rdi), %r14 + xorq -72(%rdi), %r12 + xorq -64(%rdi), %r11 + xorq -56(%rdi), %r11 + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r13 + xorq -24(%rdi), %r12 + xorq -8(%rdi), %r11 + xorq (%rdi), %r10 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r13 + xorq 32(%rdi), %r12 + xorq 40(%rdi), %r11 + xorq 48(%rdi), %r10 + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r13 + xorq 80(%rdi), %r12 + xorq 96(%rdi), %r10 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -8(%rdi), %r11 + movq 80(%rdi), %r12 + movq -32(%rdi), %r13 + movq 56(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 56(%rdi) + movq $0x8000000080008081, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 72(%rdi), %r10 + movq -40(%rdi), %r11 + movq 48(%rdi), %r12 + movq -64(%rdi), %r13 + movq -16(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 72(%rdi) + # Row 2 + movq 40(%rdi), %r10 + movq -72(%rdi), %r11 + movq 16(%rdi), %r12 + movq 64(%rdi), %r13 + movq -48(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq 
$25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 64(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 40(%rdi) + # Row 3 + movq 8(%rdi), %r10 + movq 96(%rdi), %r11 + movq -56(%rdi), %r12 + movq 32(%rdi), %r13 + movq -80(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 96(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -56(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 8(%rdi) + # Row 4 + xorq -24(%rdi), %rcx + xorq 24(%rdi), %r8 + xorq -88(%rdi), %r9 + xorq (%rdi), %rdx + xorq 88(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, -24(%rdi) + movq %r11, 24(%rdi) + movq %r12, -88(%rdi) + movq %r13, (%rdi) + movq %r14, 88(%rdi) + # Round 21 + xorq %rsi, %r10 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r11 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r11 + xorq -32(%rdi), %r13 + xorq -16(%rdi), %r14 + xorq -8(%rdi), %r11 + xorq 8(%rdi), %r10 + xorq 16(%rdi), %r12 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r10 + xorq 48(%rdi), %r12 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r13 + xorq 72(%rdi), %r10 + xorq 80(%rdi), %r12 + xorq 96(%rdi), %r11 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -40(%rdi), %r11 + movq 16(%rdi), %r12 + movq 32(%rdi), %r13 + movq 88(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 16(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 88(%rdi) + movq $0x8000000000008080, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -32(%rdi), %r10 + movq -16(%rdi), %r11 + movq 40(%rdi), %r12 + movq 96(%rdi), %r13 + movq -88(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 96(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -88(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -32(%rdi) + # Row 2 + movq -8(%rdi), %r10 + movq 
48(%rdi), %r11 + movq 64(%rdi), %r12 + movq -80(%rdi), %r13 + movq -24(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -80(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -8(%rdi) + # Row 3 + movq 56(%rdi), %r10 + movq 72(%rdi), %r11 + movq -72(%rdi), %r12 + movq -56(%rdi), %r13 + movq (%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 72(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -72(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, (%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 56(%rdi) + # Row 4 + xorq 80(%rdi), %rcx + xorq -64(%rdi), %r8 + xorq -48(%rdi), %r9 + xorq 8(%rdi), %rdx + xorq 24(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 80(%rdi) + movq %r11, -64(%rdi) + movq %r12, -48(%rdi) + movq %r13, 8(%rdi) + movq %r14, 24(%rdi) + # Round 22 + xorq %rsi, %r10 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %r12 + xorq -56(%rdi), %r13 + xorq -40(%rdi), %r11 + xorq -32(%rdi), %r10 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r11 + xorq -8(%rdi), %r10 + xorq (%rdi), %r14 + xorq 16(%rdi), %r12 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r12 + xorq 48(%rdi), %r11 + xorq 56(%rdi), %r10 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r11 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -16(%rdi), %r11 + movq 64(%rdi), %r12 + movq -56(%rdi), %r13 + movq 24(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -16(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 64(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -56(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 24(%rdi) + movq $0x80000001, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq 32(%rdi), %r10 + movq -88(%rdi), %r11 + movq -8(%rdi), %r12 + movq 72(%rdi), %r13 + movq -48(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq $3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -8(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, 
%r15 + movq %r15, 72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -48(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 32(%rdi) + # Row 2 + movq -40(%rdi), %r10 + movq 40(%rdi), %r11 + movq -80(%rdi), %r12 + movq (%rdi), %r13 + movq 80(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 40(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, (%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 80(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -40(%rdi) + # Row 3 + movq 88(%rdi), %r10 + movq -32(%rdi), %r11 + movq 48(%rdi), %r12 + movq -72(%rdi), %r13 + movq 8(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 48(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 8(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 88(%rdi) + # Row 4 + xorq 16(%rdi), %rcx + xorq 96(%rdi), %r8 + xorq -24(%rdi), %r9 + xorq 56(%rdi), %rdx + xorq -64(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 16(%rdi) + movq %r11, 96(%rdi) + movq %r12, -24(%rdi) + movq %r13, 56(%rdi) + movq %r14, -64(%rdi) + # Round 23 + xorq %rsi, %r10 + xorq -88(%rdi), %r11 + xorq -80(%rdi), %r12 + xorq -72(%rdi), %r13 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r10 + xorq -32(%rdi), %r11 + xorq -16(%rdi), %r11 + xorq -8(%rdi), %r12 + xorq (%rdi), %r13 + xorq 8(%rdi), %r14 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %r10 + xorq 40(%rdi), %r11 + xorq 48(%rdi), %r12 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r13 + xorq 80(%rdi), %r14 + xorq 88(%rdi), %r10 + # Calc t[0..4] + rorxq $63, %r11, %rdx + rorxq $63, %r12, %rax + rorxq $63, %r13, %rcx + rorxq $63, %r14, %r8 + rorxq $63, %r10, %r9 + xorq %r14, %rdx + xorq %r10, %rax + xorq %r11, %rcx + xorq %r12, %r8 + xorq %r13, %r9 + # Row Mix + # Row 0 + movq %rsi, %r10 + movq -88(%rdi), %r11 + movq -80(%rdi), %r12 + movq -72(%rdi), %r13 + movq -64(%rdi), %r14 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + rolq $44, %r11 + rolq $43, %r12 + rolq $21, %r13 + rolq $14, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -88(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -80(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -72(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -64(%rdi) + movq $0x8000000080008008, %r14 + andnq %r12, %r11, %rsi + xorq %r10, %rsi + # XOR in constant + xorq %r14, %rsi + # Row 1 + movq -56(%rdi), %r10 + movq -48(%rdi), %r11 + movq -40(%rdi), %r12 + movq -32(%rdi), %r13 + movq -24(%rdi), %r14 + xorq %r8, %r10 + xorq %r9, %r11 + xorq %rdx, %r12 + xorq %rax, %r13 + xorq %rcx, %r14 + rolq $28, %r10 + rolq $20, %r11 + rolq 
$3, %r12 + rolq $45, %r13 + rolq $61, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -48(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, -40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, -32(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, -24(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -56(%rdi) + # Row 2 + movq -16(%rdi), %r10 + movq -8(%rdi), %r11 + movq (%rdi), %r12 + movq 8(%rdi), %r13 + movq 16(%rdi), %r14 + xorq %rax, %r10 + xorq %rcx, %r11 + xorq %r8, %r12 + xorq %r9, %r13 + xorq %rdx, %r14 + rolq $0x01, %r10 + rolq $6, %r11 + rolq $25, %r12 + rolq $8, %r13 + rolq $18, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, -8(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, (%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 8(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 16(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, -16(%rdi) + # Row 3 + movq 24(%rdi), %r10 + movq 32(%rdi), %r11 + movq 40(%rdi), %r12 + movq 48(%rdi), %r13 + movq 56(%rdi), %r14 + xorq %r9, %r10 + xorq %rdx, %r11 + xorq %rax, %r12 + xorq %rcx, %r13 + xorq %r8, %r14 + rolq $27, %r10 + rolq $36, %r11 + rolq $10, %r12 + rolq $15, %r13 + rolq $56, %r14 + andnq %r13, %r12, %r15 + xorq %r11, %r15 + movq %r15, 32(%rdi) + andnq %r14, %r13, %r15 + xorq %r12, %r15 + movq %r15, 40(%rdi) + andnq %r10, %r14, %r15 + xorq %r13, %r15 + movq %r15, 48(%rdi) + andnq %r11, %r10, %r15 + xorq %r14, %r15 + movq %r15, 56(%rdi) + andnq %r12, %r11, %r15 + xorq %r10, %r15 + movq %r15, 24(%rdi) + # Row 4 + xorq 64(%rdi), %rcx + xorq 72(%rdi), %r8 + xorq 80(%rdi), %r9 + xorq 88(%rdi), %rdx + xorq 96(%rdi), %rax + rorxq $2, %rcx, %r10 + rorxq $9, %r8, %r11 + rorxq $25, %r9, %r12 + rorxq $23, %rdx, %r13 + rorxq $62, %rax, %r14 + andnq %r12, %r11, %rdx + andnq %r13, %r12, %rax + andnq %r14, %r13, %rcx + andnq %r10, %r14, %r8 + andnq %r11, %r10, %r9 + xorq %rdx, %r10 + xorq %rax, %r11 + xorq %rcx, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + movq %r10, 64(%rdi) + movq %r11, 72(%rdi) + movq %r12, 80(%rdi) + movq %r13, 88(%rdi) + movq %r14, 96(%rdi) + movq %rsi, -96(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sha3_block_bmi2,.-sha3_block_bmi2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sha3_block_n_bmi2 +.type sha3_block_n_bmi2,@function +.align 16 +sha3_block_n_bmi2: +#else +.section __TEXT,__text +.globl _sha3_block_n_bmi2 +.p2align 4 +_sha3_block_n_bmi2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + pushq %rcx + movq %rcx, %rbp + movq (%rdi), %rcx + addq $0x60, %rdi +L_sha3_block_n_bmi2_start: + cmpq $0x88, %rbp + je L_sha3_block_n_bmi2_load_256 + cmpq $0xa8, %rbp + je L_sha3_block_n_bmi2_load_128 + cmpq $0x90, %rbp + je L_sha3_block_n_bmi2_load_224 + cmpq $0x68, %rbp + je L_sha3_block_n_bmi2_load_384 + movq (%rsi), %r12 + movq 8(%rsi), %r13 + movq 16(%rsi), %r14 + movq 24(%rsi), %r15 + movq 32(%rsi), %rbx + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + movq 56(%rsi), %r9 + movq 64(%rsi), %r10 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq -56(%rdi), %rax + xorq -48(%rdi), %r8 + xorq -40(%rdi), %r9 + xorq -32(%rdi), %r10 + movq %r12, %rcx + movq %r13, -88(%rdi) + movq %r14, -80(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -64(%rdi) + movq %rax, -56(%rdi) + movq %r8, -48(%rdi) + movq %r9, -40(%rdi) + movq %r10, -32(%rdi) + jmp 
L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_128: + movq (%rsi), %r12 + movq 8(%rsi), %r13 + movq 16(%rsi), %r14 + movq 24(%rsi), %r15 + movq 32(%rsi), %rbx + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + movq %r12, %rcx + movq %r13, -88(%rdi) + movq %r14, -80(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -64(%rdi) + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + movq 56(%rsi), %r9 + movq 64(%rsi), %r10 + movq 72(%rsi), %r11 + movq 80(%rsi), %rbp + xorq -56(%rdi), %rax + xorq -48(%rdi), %r8 + xorq -40(%rdi), %r9 + xorq -32(%rdi), %r10 + xorq -24(%rdi), %r11 + xorq -16(%rdi), %rbp + movq %rax, -56(%rdi) + movq %r8, -48(%rdi) + movq %r9, -40(%rdi) + movq %r10, -32(%rdi) + movq %r11, -24(%rdi) + movq %rbp, -16(%rdi) + movq 88(%rsi), %rax + movq 96(%rsi), %r8 + movq 104(%rsi), %r9 + movq 112(%rsi), %r10 + movq 120(%rsi), %r11 + movq 128(%rsi), %rbp + xorq -8(%rdi), %rax + xorq (%rdi), %r8 + xorq 8(%rdi), %r9 + xorq 16(%rdi), %r10 + xorq 24(%rdi), %r11 + xorq 32(%rdi), %rbp + movq %rax, -8(%rdi) + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %rbp, 32(%rdi) + movq 136(%rsi), %rax + movq 144(%rsi), %r8 + movq 152(%rsi), %r9 + movq 160(%rsi), %r10 + xorq 40(%rdi), %rax + xorq 48(%rdi), %r8 + xorq 56(%rdi), %r9 + xorq 64(%rdi), %r10 + movq %rax, 40(%rdi) + movq %r8, 48(%rdi) + movq %r9, 56(%rdi) + movq %r10, 64(%rdi) + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_224: + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + movq 64(%rsi), %r15 + movq 72(%rsi), %rbx + movq 80(%rsi), %rax + movq 88(%rsi), %r8 + movq 96(%rsi), %r9 + movq 104(%rsi), %r10 + movq 112(%rsi), %r11 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r15 + xorq -24(%rdi), %rbx + xorq -16(%rdi), %rax + xorq -8(%rdi), %r8 + xorq (%rdi), %r9 + xorq 8(%rdi), %r10 + xorq 16(%rdi), %r11 + movq %r12, -56(%rdi) + movq %r13, -48(%rdi) + movq %r14, -40(%rdi) + movq %r15, -32(%rdi) + movq %rbx, -24(%rdi) + movq %rax, -16(%rdi) + movq %r8, -8(%rdi) + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq (%rsi), %r12 + movq 8(%rsi), %r13 + movq 16(%rsi), %r14 + movq 24(%rsi), %r15 + movq 32(%rsi), %rbx + movq 120(%rsi), %rax + movq 128(%rsi), %r8 + movq 136(%rsi), %r9 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq 24(%rdi), %rax + xorq 32(%rdi), %r8 + xorq 40(%rdi), %r9 + movq %r12, %rcx + movq %r13, -88(%rdi) + movq %r14, -80(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -64(%rdi) + movq %rax, 24(%rdi) + movq %r8, 32(%rdi) + movq %r9, 40(%rdi) + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_384: + movq (%rsi), %r12 + movq 8(%rsi), %r13 + movq 16(%rsi), %r14 + movq 24(%rsi), %r15 + movq 32(%rsi), %rbx + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + movq 56(%rsi), %r9 + movq 64(%rsi), %r10 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq -56(%rdi), %rax + xorq -48(%rdi), %r8 + xorq -40(%rdi), %r9 + xorq -32(%rdi), %r10 + movq %r12, %rcx + movq %r13, -88(%rdi) + movq %r14, -80(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -64(%rdi) + movq %rax, -56(%rdi) + movq %r8, -48(%rdi) + movq %r9, -40(%rdi) + movq %r10, -32(%rdi) + movq 72(%rsi), %rax + movq 80(%rsi), %r8 + movq 88(%rsi), %r9 + movq 96(%rsi), %r10 + xorq -24(%rdi), %rax + xorq -16(%rdi), %r8 + xorq -8(%rdi), %r9 + xorq (%rdi), %r10 + movq %rax, -24(%rdi) + movq %r8, 
-16(%rdi) + movq %r9, -8(%rdi) + movq %r10, (%rdi) + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_256: + movq (%rsi), %r12 + movq 8(%rsi), %r13 + movq 16(%rsi), %r14 + movq 24(%rsi), %r15 + movq 32(%rsi), %rbx + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + movq 56(%rsi), %r9 + movq 64(%rsi), %r10 + movq 72(%rsi), %r11 + movq 80(%rsi), %rbp + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq -56(%rdi), %rax + xorq -48(%rdi), %r8 + xorq -40(%rdi), %r9 + xorq -32(%rdi), %r10 + xorq -24(%rdi), %r11 + xorq -16(%rdi), %rbp + movq %r12, %rcx + movq %r13, -88(%rdi) + movq %r14, -80(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -64(%rdi) + movq %rax, -56(%rdi) + movq %r8, -48(%rdi) + movq %r9, -40(%rdi) + movq %r10, -32(%rdi) + movq %r11, -24(%rdi) + movq %rbp, -16(%rdi) + movq 88(%rsi), %rax + movq 96(%rsi), %r8 + movq 104(%rsi), %r9 + movq 112(%rsi), %r10 + movq 120(%rsi), %r11 + movq 128(%rsi), %rbp + xorq -8(%rdi), %rax + xorq (%rdi), %r8 + xorq 8(%rdi), %r9 + xorq 16(%rdi), %r10 + xorq 24(%rdi), %r11 + xorq 32(%rdi), %rbp + movq %rax, -8(%rdi) + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %rbp, 32(%rdi) +L_sha3_block_n_bmi2_rounds: + # Round 0 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r15 + xorq -24(%rdi), %rbx + xorq -16(%rdi), %r12 + xorq -8(%rdi), %r13 + xorq (%rdi), %r14 + xorq 8(%rdi), %r15 + xorq 16(%rdi), %rbx + xorq 24(%rdi), %r12 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r15 + xorq 56(%rdi), %rbx + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r13 + xorq 80(%rdi), %r14 + xorq 88(%rdi), %r15 + xorq 96(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -48(%rdi), %r13 + movq (%rdi), %r14 + movq 48(%rdi), %r15 + movq 96(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, (%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 96(%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x01, %rcx + # Row 1 + movq -72(%rdi), %r12 + movq -24(%rdi), %r13 + movq -16(%rdi), %r14 + movq 32(%rdi), %r15 + movq 80(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -72(%rdi) + # Row 2 + movq -88(%rdi), %r12 + movq -40(%rdi), %r13 + movq 8(%rdi), %r14 + movq 56(%rdi), %r15 + movq 64(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -40(%rdi) + andnq 
%rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -88(%rdi) + # Row 3 + movq -64(%rdi), %r12 + movq -56(%rdi), %r13 + movq -8(%rdi), %r14 + movq 40(%rdi), %r15 + movq 88(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -64(%rdi) + # Row 4 + xorq -80(%rdi), %r9 + xorq -32(%rdi), %r10 + xorq 16(%rdi), %r11 + xorq 24(%rdi), %rax + xorq 72(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -80(%rdi) + movq %r13, -32(%rdi) + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + movq %rbx, 72(%rdi) + # Round 1 + xorq %rcx, %r12 + xorq -88(%rdi), %r12 + xorq -72(%rdi), %r12 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r13 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r14 + xorq -8(%rdi), %r14 + xorq (%rdi), %r14 + xorq 8(%rdi), %r14 + xorq 32(%rdi), %r15 + xorq 40(%rdi), %r15 + xorq 48(%rdi), %r15 + xorq 56(%rdi), %r15 + xorq 64(%rdi), %rbx + xorq 80(%rdi), %rbx + xorq 88(%rdi), %rbx + xorq 96(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -24(%rdi), %r13 + movq 8(%rdi), %r14 + movq 40(%rdi), %r15 + movq 72(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 72(%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x8082, %rcx + # Row 1 + movq 48(%rdi), %r12 + movq 80(%rdi), %r13 + movq -88(%rdi), %r14 + movq -56(%rdi), %r15 + movq 16(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 48(%rdi) + # Row 2 + movq -48(%rdi), %r12 + movq -16(%rdi), %r13 + movq 56(%rdi), %r14 + movq 88(%rdi), %r15 + movq -80(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + 
xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -48(%rdi) + # Row 3 + movq 96(%rdi), %r12 + movq -72(%rdi), %r13 + movq -40(%rdi), %r14 + movq -8(%rdi), %r15 + movq 24(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 96(%rdi) + # Row 4 + xorq (%rdi), %r9 + xorq 32(%rdi), %r10 + xorq 64(%rdi), %r11 + xorq -64(%rdi), %rax + xorq -32(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, (%rdi) + movq %r13, 32(%rdi) + movq %r14, 64(%rdi) + movq %r15, -64(%rdi) + movq %rbx, -32(%rdi) + # Round 2 + xorq %rcx, %r12 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %rbx + xorq -72(%rdi), %r13 + xorq -56(%rdi), %r15 + xorq -48(%rdi), %r12 + xorq -40(%rdi), %r14 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r15 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %rbx + xorq 24(%rdi), %rbx + xorq 40(%rdi), %r15 + xorq 48(%rdi), %r12 + xorq 56(%rdi), %r14 + xorq 72(%rdi), %rbx + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r15 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 80(%rdi), %r13 + movq 56(%rdi), %r14 + movq -8(%rdi), %r15 + movq -32(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -32(%rdi) + movq $0x800000000000808a, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 40(%rdi), %r12 + movq 16(%rdi), %r13 + movq -48(%rdi), %r14 + movq -72(%rdi), %r15 + movq 64(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq 
%rbp, 40(%rdi) + # Row 2 + movq -24(%rdi), %r12 + movq -88(%rdi), %r13 + movq 88(%rdi), %r14 + movq 24(%rdi), %r15 + movq (%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, (%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -24(%rdi) + # Row 3 + movq 72(%rdi), %r12 + movq 48(%rdi), %r13 + movq -16(%rdi), %r14 + movq -40(%rdi), %r15 + movq -64(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 72(%rdi) + # Row 4 + xorq 8(%rdi), %r9 + xorq -56(%rdi), %r10 + xorq -80(%rdi), %r11 + xorq 96(%rdi), %rax + xorq 32(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 8(%rdi) + movq %r13, -56(%rdi) + movq %r14, -80(%rdi) + movq %r15, 96(%rdi) + movq %rbx, 32(%rdi) + # Round 3 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r15 + xorq -32(%rdi), %rbx + xorq -24(%rdi), %r12 + xorq -16(%rdi), %r14 + xorq -8(%rdi), %r15 + xorq (%rdi), %rbx + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r15 + xorq 40(%rdi), %r12 + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %rbx + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 16(%rdi), %r13 + movq 88(%rdi), %r14 + movq -40(%rdi), %r15 + movq 32(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 32(%rdi) + movq $0x8000000080008000, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -8(%rdi), %r12 + movq 64(%rdi), %r13 + movq -24(%rdi), %r14 + movq 48(%rdi), %r15 + movq -80(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + 
movq %rbp, -24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -8(%rdi) + # Row 2 + movq 80(%rdi), %r12 + movq -48(%rdi), %r13 + movq 24(%rdi), %r14 + movq -64(%rdi), %r15 + movq 8(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 80(%rdi) + # Row 3 + movq -32(%rdi), %r12 + movq 40(%rdi), %r13 + movq -88(%rdi), %r14 + movq -16(%rdi), %r15 + movq 96(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 96(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -32(%rdi) + # Row 4 + xorq 56(%rdi), %r9 + xorq -72(%rdi), %r10 + xorq (%rdi), %r11 + xorq 72(%rdi), %rax + xorq -56(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 56(%rdi) + movq %r13, -72(%rdi) + movq %r14, (%rdi) + movq %r15, 72(%rdi) + movq %rbx, -56(%rdi) + # Round 4 + xorq %rcx, %r12 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %rbx + xorq -64(%rdi), %r15 + xorq -48(%rdi), %r13 + xorq -40(%rdi), %r15 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r15 + xorq -8(%rdi), %r12 + xorq 8(%rdi), %rbx + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %rbx + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r15 + xorq 64(%rdi), %r13 + xorq 80(%rdi), %r12 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 64(%rdi), %r13 + movq 24(%rdi), %r14 + movq -16(%rdi), %r15 + movq -56(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -56(%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x808b, %rcx + # Row 1 + movq -40(%rdi), %r12 + movq -80(%rdi), %r13 + movq 80(%rdi), %r14 + movq 40(%rdi), %r15 + movq (%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + 
rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, (%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -40(%rdi) + # Row 2 + movq 16(%rdi), %r12 + movq -24(%rdi), %r13 + movq -64(%rdi), %r14 + movq 96(%rdi), %r15 + movq 56(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 96(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 16(%rdi) + # Row 3 + movq 32(%rdi), %r12 + movq -8(%rdi), %r13 + movq -48(%rdi), %r14 + movq -88(%rdi), %r15 + movq 72(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 32(%rdi) + # Row 4 + xorq 88(%rdi), %r9 + xorq 48(%rdi), %r10 + xorq 8(%rdi), %r11 + xorq -32(%rdi), %rax + xorq -72(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 88(%rdi) + movq %r13, 48(%rdi) + movq %r14, 8(%rdi) + movq %r15, -32(%rdi) + movq %rbx, -72(%rdi) + # Round 5 + xorq %rcx, %r12 + xorq -88(%rdi), %r15 + xorq -80(%rdi), %r13 + xorq -64(%rdi), %r14 + xorq -56(%rdi), %rbx + xorq -48(%rdi), %r14 + xorq -40(%rdi), %r12 + xorq -24(%rdi), %r13 + xorq -16(%rdi), %r15 + xorq -8(%rdi), %r13 + xorq (%rdi), %rbx + xorq 16(%rdi), %r12 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %r12 + xorq 40(%rdi), %r15 + xorq 56(%rdi), %rbx + xorq 64(%rdi), %r13 + xorq 72(%rdi), %rbx + xorq 80(%rdi), %r14 + xorq 96(%rdi), %r15 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -80(%rdi), %r13 + movq -64(%rdi), %r14 + movq -88(%rdi), %r15 + movq -72(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -72(%rdi) + movq $0x80000001, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -16(%rdi), 
%r12 + movq (%rdi), %r13 + movq 16(%rdi), %r14 + movq -8(%rdi), %r15 + movq 8(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, (%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -16(%rdi) + # Row 2 + movq 64(%rdi), %r12 + movq 80(%rdi), %r13 + movq 96(%rdi), %r14 + movq 72(%rdi), %r15 + movq 88(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 96(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 64(%rdi) + # Row 3 + movq -56(%rdi), %r12 + movq -40(%rdi), %r13 + movq -24(%rdi), %r14 + movq -48(%rdi), %r15 + movq -32(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -56(%rdi) + # Row 4 + xorq 24(%rdi), %r9 + xorq 40(%rdi), %r10 + xorq 56(%rdi), %r11 + xorq 32(%rdi), %rax + xorq 48(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 24(%rdi) + movq %r13, 40(%rdi) + movq %r14, 56(%rdi) + movq %r15, 32(%rdi) + movq %rbx, 48(%rdi) + # Round 6 + xorq %rcx, %r12 + xorq -88(%rdi), %r15 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %rbx + xorq -64(%rdi), %r14 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r15 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %rbx + xorq -24(%rdi), %r14 + xorq -16(%rdi), %r12 + xorq -8(%rdi), %r15 + xorq (%rdi), %r13 + xorq 8(%rdi), %rbx + xorq 16(%rdi), %r14 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r15 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %rbx + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq (%rdi), %r13 + movq 96(%rdi), %r14 + movq -48(%rdi), %r15 + movq 48(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, (%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 96(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -48(%rdi) + andnq %r13, 
%r12, %rbp + xorq %rbx, %rbp + movq %rbp, 48(%rdi) + movq $0x8000000080008081, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -88(%rdi), %r12 + movq 8(%rdi), %r13 + movq 64(%rdi), %r14 + movq -40(%rdi), %r15 + movq 56(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -88(%rdi) + # Row 2 + movq -80(%rdi), %r12 + movq 16(%rdi), %r13 + movq 72(%rdi), %r14 + movq -32(%rdi), %r15 + movq 24(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -80(%rdi) + # Row 3 + movq -72(%rdi), %r12 + movq -16(%rdi), %r13 + movq 80(%rdi), %r14 + movq -24(%rdi), %r15 + movq 32(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -72(%rdi) + # Row 4 + xorq -64(%rdi), %r9 + xorq -8(%rdi), %r10 + xorq 88(%rdi), %r11 + xorq -56(%rdi), %rax + xorq 40(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -64(%rdi) + movq %r13, -8(%rdi) + movq %r14, 88(%rdi) + movq %r15, -56(%rdi) + movq %rbx, 40(%rdi) + # Round 7 + xorq %rcx, %r12 + xorq -88(%rdi), %r12 + xorq -80(%rdi), %r12 + xorq -72(%rdi), %r12 + xorq -48(%rdi), %r15 + xorq -40(%rdi), %r15 + xorq -32(%rdi), %r15 + xorq -24(%rdi), %r15 + xorq -16(%rdi), %r13 + xorq (%rdi), %r13 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %rbx + xorq 32(%rdi), %rbx + xorq 48(%rdi), %rbx + xorq 56(%rdi), %rbx + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r14 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 8(%rdi), %r13 + movq 72(%rdi), %r14 + movq -24(%rdi), %r15 + movq 40(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + 
andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 40(%rdi) + movq $0x8000000000008009, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -48(%rdi), %r12 + movq 56(%rdi), %r13 + movq -80(%rdi), %r14 + movq -16(%rdi), %r15 + movq 88(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -48(%rdi) + # Row 2 + movq (%rdi), %r12 + movq 64(%rdi), %r13 + movq -32(%rdi), %r14 + movq 32(%rdi), %r15 + movq -64(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, (%rdi) + # Row 3 + movq 48(%rdi), %r12 + movq -88(%rdi), %r13 + movq 16(%rdi), %r14 + movq 80(%rdi), %r15 + movq -56(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 48(%rdi) + # Row 4 + xorq 96(%rdi), %r9 + xorq -40(%rdi), %r10 + xorq 24(%rdi), %r11 + xorq -72(%rdi), %rax + xorq -8(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 96(%rdi) + movq %r13, -40(%rdi) + movq %r14, 24(%rdi) + movq %r15, -72(%rdi) + movq %rbx, -8(%rdi) + # Round 8 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -64(%rdi), %rbx + xorq -56(%rdi), %rbx + xorq -48(%rdi), %r12 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r15 + xorq -16(%rdi), %r15 + xorq (%rdi), %r12 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r14 + xorq 32(%rdi), %r15 + xorq 40(%rdi), %rbx + xorq 48(%rdi), %r12 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r13 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r15 + xorq 88(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 56(%rdi), %r13 + movq -32(%rdi), %r14 
+ movq 80(%rdi), %r15 + movq -8(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -8(%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x8a, %rcx + # Row 1 + movq -24(%rdi), %r12 + movq 88(%rdi), %r13 + movq (%rdi), %r14 + movq -88(%rdi), %r15 + movq 24(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, (%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -24(%rdi) + # Row 2 + movq 8(%rdi), %r12 + movq -80(%rdi), %r13 + movq 32(%rdi), %r14 + movq -56(%rdi), %r15 + movq 96(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 96(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 8(%rdi) + # Row 3 + movq 40(%rdi), %r12 + movq -48(%rdi), %r13 + movq 64(%rdi), %r14 + movq 16(%rdi), %r15 + movq -72(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 40(%rdi) + # Row 4 + xorq 72(%rdi), %r9 + xorq -16(%rdi), %r10 + xorq -64(%rdi), %r11 + xorq 48(%rdi), %rax + xorq -40(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 72(%rdi) + movq %r13, -16(%rdi) + movq %r14, -64(%rdi) + movq %r15, 48(%rdi) + movq %rbx, -40(%rdi) + # Round 9 + xorq %rcx, %r12 + xorq -88(%rdi), %r15 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %rbx + xorq -56(%rdi), %r15 + xorq -48(%rdi), %r13 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r12 + xorq -8(%rdi), %rbx + xorq (%rdi), %r14 + xorq 8(%rdi), %r12 + xorq 16(%rdi), %r15 + xorq 24(%rdi), %rbx + xorq 32(%rdi), %r14 + xorq 40(%rdi), %r12 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r14 + xorq 80(%rdi), %r15 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + 
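# Theta: t[x] = C[x-1] ^ ROTL64(C[x+1], 1); the rorx by 63 above is the rotate-left by 1 +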
xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 88(%rdi), %r13 + movq 32(%rdi), %r14 + movq 16(%rdi), %r15 + movq -40(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -40(%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x88, %rcx + # Row 1 + movq 80(%rdi), %r12 + movq 24(%rdi), %r13 + movq 8(%rdi), %r14 + movq -48(%rdi), %r15 + movq -64(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 80(%rdi) + # Row 2 + movq 56(%rdi), %r12 + movq (%rdi), %r13 + movq -56(%rdi), %r14 + movq -72(%rdi), %r15 + movq 72(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, (%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 56(%rdi) + # Row 3 + movq -8(%rdi), %r12 + movq -24(%rdi), %r13 + movq -80(%rdi), %r14 + movq 64(%rdi), %r15 + movq 48(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -8(%rdi) + # Row 4 + xorq -32(%rdi), %r9 + xorq -88(%rdi), %r10 + xorq 96(%rdi), %r11 + xorq 40(%rdi), %rax + xorq -16(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -32(%rdi) + movq %r13, -88(%rdi) + movq %r14, 96(%rdi) + movq %r15, 40(%rdi) + movq %rbx, -16(%rdi) + # Round 10 + xorq %rcx, %r12 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -64(%rdi), %rbx + xorq -56(%rdi), %r14 + xorq -48(%rdi), %r15 + xorq -40(%rdi), %rbx + xorq -24(%rdi), %r13 + xorq -8(%rdi), %r12 + xorq (%rdi), %r13 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %r15 + xorq 24(%rdi), %r13 + xorq 32(%rdi), %r14 + xorq 48(%rdi), %rbx + xorq 56(%rdi), %r12 + xorq 64(%rdi), %r15 + xorq 72(%rdi), %rbx + xorq 
80(%rdi), %r12 + xorq 88(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 24(%rdi), %r13 + movq -56(%rdi), %r14 + movq 64(%rdi), %r15 + movq -16(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -16(%rdi) + movq $0x80008009, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 16(%rdi), %r12 + movq -64(%rdi), %r13 + movq 56(%rdi), %r14 + movq -24(%rdi), %r15 + movq 96(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 96(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 16(%rdi) + # Row 2 + movq 88(%rdi), %r12 + movq 8(%rdi), %r13 + movq -72(%rdi), %r14 + movq 48(%rdi), %r15 + movq -32(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 88(%rdi) + # Row 3 + movq -40(%rdi), %r12 + movq 80(%rdi), %r13 + movq (%rdi), %r14 + movq -80(%rdi), %r15 + movq 40(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, (%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -40(%rdi) + # Row 4 + xorq 32(%rdi), %r9 + xorq -48(%rdi), %r10 + xorq 72(%rdi), %r11 + xorq -8(%rdi), %rax + xorq -88(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 32(%rdi) + movq %r13, -48(%rdi) + movq %r14, 72(%rdi) + movq %r15, -8(%rdi) + movq %rbx, -88(%rdi) + # Round 11 + xorq %rcx, %r12 + xorq -80(%rdi), %r15 + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %r14 + xorq -40(%rdi), %r12 + xorq -32(%rdi), %rbx + xorq -24(%rdi), %r15 + xorq -16(%rdi), %rbx + xorq 
(%rdi), %r14 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %r12 + xorq 24(%rdi), %r13 + xorq 40(%rdi), %rbx + xorq 48(%rdi), %r15 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r15 + xorq 80(%rdi), %r13 + xorq 88(%rdi), %r12 + xorq 96(%rdi), %rbx + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -64(%rdi), %r13 + movq -72(%rdi), %r14 + movq -80(%rdi), %r15 + movq -88(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -88(%rdi) + movq $0x8000000a, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 64(%rdi), %r12 + movq 96(%rdi), %r13 + movq 88(%rdi), %r14 + movq 80(%rdi), %r15 + movq 72(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 96(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 64(%rdi) + # Row 2 + movq 24(%rdi), %r12 + movq 56(%rdi), %r13 + movq 48(%rdi), %r14 + movq 40(%rdi), %r15 + movq 32(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 24(%rdi) + # Row 3 + movq -16(%rdi), %r12 + movq 16(%rdi), %r13 + movq 8(%rdi), %r14 + movq (%rdi), %r15 + movq -8(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, (%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -16(%rdi) + # Row 4 + xorq -56(%rdi), %r9 + xorq -24(%rdi), %r10 + xorq -32(%rdi), %r11 + xorq -40(%rdi), %rax + xorq -48(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -56(%rdi) + movq %r13, -24(%rdi) + movq %r14, -32(%rdi) + movq %r15, -40(%rdi) + movq %rbx, -48(%rdi) + # Round 12 + xorq %rcx, 
%r12 + xorq -88(%rdi), %rbx + xorq -80(%rdi), %r15 + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r13 + xorq -16(%rdi), %r12 + xorq -8(%rdi), %rbx + xorq (%rdi), %r15 + xorq 8(%rdi), %r14 + xorq 16(%rdi), %r13 + xorq 24(%rdi), %r12 + xorq 32(%rdi), %rbx + xorq 40(%rdi), %r15 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r13 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %rbx + xorq 80(%rdi), %r15 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 96(%rdi), %r13 + movq 48(%rdi), %r14 + movq (%rdi), %r15 + movq -48(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 96(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, (%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -48(%rdi) + movq $0x8000808b, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -80(%rdi), %r12 + movq 72(%rdi), %r13 + movq 24(%rdi), %r14 + movq 16(%rdi), %r15 + movq -32(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -80(%rdi) + # Row 2 + movq -64(%rdi), %r12 + movq 88(%rdi), %r13 + movq 40(%rdi), %r14 + movq -8(%rdi), %r15 + movq -56(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -64(%rdi) + # Row 3 + movq -88(%rdi), %r12 + movq 64(%rdi), %r13 + movq 56(%rdi), %r14 + movq 8(%rdi), %r15 + movq -40(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -88(%rdi) + # Row 4 + xorq -72(%rdi), %r9 + xorq 80(%rdi), %r10 + xorq 32(%rdi), %r11 + xorq -16(%rdi), %rax + xorq -24(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 
+ xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -72(%rdi) + movq %r13, 80(%rdi) + movq %r14, 32(%rdi) + movq %r15, -16(%rdi) + movq %rbx, -24(%rdi) + # Round 13 + xorq %rcx, %r12 + xorq -88(%rdi), %r12 + xorq -80(%rdi), %r12 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %rbx + xorq -48(%rdi), %rbx + xorq -40(%rdi), %rbx + xorq -32(%rdi), %rbx + xorq -8(%rdi), %r15 + xorq (%rdi), %r15 + xorq 8(%rdi), %r15 + xorq 16(%rdi), %r15 + xorq 24(%rdi), %r14 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %r14 + xorq 64(%rdi), %r13 + xorq 72(%rdi), %r13 + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 72(%rdi), %r13 + movq 40(%rdi), %r14 + movq 8(%rdi), %r15 + movq -24(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -24(%rdi) + movq $0x800000000000008b, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq (%rdi), %r12 + movq -32(%rdi), %r13 + movq -64(%rdi), %r14 + movq 64(%rdi), %r15 + movq 32(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 32(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, (%rdi) + # Row 2 + movq 96(%rdi), %r12 + movq 24(%rdi), %r13 + movq -8(%rdi), %r14 + movq -40(%rdi), %r15 + movq -72(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -40(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 96(%rdi) + # Row 3 + movq -48(%rdi), %r12 + movq -80(%rdi), %r13 + movq 88(%rdi), %r14 + movq 56(%rdi), %r15 + movq -16(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -80(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -48(%rdi) + # Row 4 + xorq 48(%rdi), %r9 + xorq 16(%rdi), %r10 + xorq -56(%rdi), %r11 + xorq -88(%rdi), %rax + xorq 80(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + 
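# rorx rotates right, so each count is 64 minus the rho left-rotation for that lane +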
rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 48(%rdi) + movq %r13, 16(%rdi) + movq %r14, -56(%rdi) + movq %r15, -88(%rdi) + movq %rbx, 80(%rdi) + # Round 14 + xorq %rcx, %r12 + xorq -80(%rdi), %r13 + xorq -72(%rdi), %rbx + xorq -64(%rdi), %r14 + xorq -48(%rdi), %r12 + xorq -40(%rdi), %r15 + xorq -32(%rdi), %r13 + xorq -24(%rdi), %rbx + xorq -16(%rdi), %rbx + xorq -8(%rdi), %r14 + xorq (%rdi), %r12 + xorq 8(%rdi), %r15 + xorq 24(%rdi), %r13 + xorq 32(%rdi), %rbx + xorq 40(%rdi), %r14 + xorq 56(%rdi), %r15 + xorq 64(%rdi), %r15 + xorq 72(%rdi), %r13 + xorq 88(%rdi), %r14 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -32(%rdi), %r13 + movq -8(%rdi), %r14 + movq 56(%rdi), %r15 + movq 80(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 80(%rdi) + movq $0x8000000000008089, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 8(%rdi), %r12 + movq 32(%rdi), %r13 + movq 96(%rdi), %r14 + movq -80(%rdi), %r15 + movq -56(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 96(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 8(%rdi) + # Row 2 + movq 72(%rdi), %r12 + movq -64(%rdi), %r13 + movq -40(%rdi), %r14 + movq -16(%rdi), %r15 + movq 48(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 72(%rdi) + # Row 3 + movq -24(%rdi), %r12 + movq (%rdi), %r13 + movq 24(%rdi), %r14 + movq 88(%rdi), %r15 + movq -88(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, (%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq 
%rbp, -24(%rdi) + # Row 4 + xorq 40(%rdi), %r9 + xorq 64(%rdi), %r10 + xorq -72(%rdi), %r11 + xorq -48(%rdi), %rax + xorq 16(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 40(%rdi) + movq %r13, 64(%rdi) + movq %r14, -72(%rdi) + movq %r15, -48(%rdi) + movq %rbx, 16(%rdi) + # Round 15 + xorq %rcx, %r12 + xorq -88(%rdi), %rbx + xorq -80(%rdi), %r15 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %rbx + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r13 + xorq -24(%rdi), %r12 + xorq -16(%rdi), %r15 + xorq -8(%rdi), %r14 + xorq (%rdi), %r13 + xorq 8(%rdi), %r12 + xorq 24(%rdi), %r14 + xorq 32(%rdi), %r13 + xorq 48(%rdi), %rbx + xorq 56(%rdi), %r15 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %rbx + xorq 88(%rdi), %r15 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 32(%rdi), %r13 + movq -40(%rdi), %r14 + movq 88(%rdi), %r15 + movq 16(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 16(%rdi) + movq $0x8000000000008003, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 56(%rdi), %r12 + movq -56(%rdi), %r13 + movq 72(%rdi), %r14 + movq (%rdi), %r15 + movq -72(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, (%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -72(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 56(%rdi) + # Row 2 + movq -32(%rdi), %r12 + movq 96(%rdi), %r13 + movq -16(%rdi), %r14 + movq -88(%rdi), %r15 + movq 40(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 96(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -32(%rdi) + # Row 3 + movq 80(%rdi), %r12 + movq 8(%rdi), %r13 + movq -64(%rdi), %r14 + movq 24(%rdi), %r15 + movq -48(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp 
+ movq %rbp, -64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 80(%rdi) + # Row 4 + xorq -8(%rdi), %r9 + xorq -80(%rdi), %r10 + xorq 48(%rdi), %r11 + xorq -24(%rdi), %rax + xorq 64(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -8(%rdi) + movq %r13, -80(%rdi) + movq %r14, 48(%rdi) + movq %r15, -24(%rdi) + movq %rbx, 64(%rdi) + # Round 16 + xorq %rcx, %r12 + xorq -88(%rdi), %r15 + xorq -72(%rdi), %rbx + xorq -64(%rdi), %r14 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %rbx + xorq -40(%rdi), %r14 + xorq -32(%rdi), %r12 + xorq -16(%rdi), %r14 + xorq (%rdi), %r15 + xorq 8(%rdi), %r13 + xorq 16(%rdi), %rbx + xorq 24(%rdi), %r15 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %rbx + xorq 56(%rdi), %r12 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %r12 + xorq 88(%rdi), %r15 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -56(%rdi), %r13 + movq -16(%rdi), %r14 + movq 24(%rdi), %r15 + movq 64(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 64(%rdi) + movq $0x8000000000008002, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 88(%rdi), %r12 + movq -72(%rdi), %r13 + movq -32(%rdi), %r14 + movq 8(%rdi), %r15 + movq 48(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 88(%rdi) + # Row 2 + movq 32(%rdi), %r12 + movq 72(%rdi), %r13 + movq -88(%rdi), %r14 + movq -48(%rdi), %r15 + movq -8(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 32(%rdi) + # Row 3 + movq 16(%rdi), %r12 + movq 56(%rdi), %r13 + movq 96(%rdi), %r14 + movq -64(%rdi), %r15 + movq -24(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + 
xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 96(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 16(%rdi) + # Row 4 + xorq -40(%rdi), %r9 + xorq (%rdi), %r10 + xorq 40(%rdi), %r11 + xorq 80(%rdi), %rax + xorq -80(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -40(%rdi) + movq %r13, (%rdi) + movq %r14, 40(%rdi) + movq %r15, 80(%rdi) + movq %rbx, -80(%rdi) + # Round 17 + xorq %rcx, %r12 + xorq -88(%rdi), %r14 + xorq -72(%rdi), %r13 + xorq -64(%rdi), %r15 + xorq -56(%rdi), %r13 + xorq -48(%rdi), %r15 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %rbx + xorq -16(%rdi), %r14 + xorq -8(%rdi), %rbx + xorq 8(%rdi), %r15 + xorq 16(%rdi), %r12 + xorq 24(%rdi), %r15 + xorq 32(%rdi), %r12 + xorq 48(%rdi), %rbx + xorq 56(%rdi), %r13 + xorq 64(%rdi), %rbx + xorq 72(%rdi), %r13 + xorq 88(%rdi), %r12 + xorq 96(%rdi), %r14 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -72(%rdi), %r13 + movq -88(%rdi), %r14 + movq -64(%rdi), %r15 + movq -80(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -88(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -80(%rdi) + movq $0x8000000000000080, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 24(%rdi), %r12 + movq 48(%rdi), %r13 + movq 32(%rdi), %r14 + movq 56(%rdi), %r15 + movq 40(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 24(%rdi) + # Row 2 + movq -56(%rdi), %r12 + movq -32(%rdi), %r13 + movq -48(%rdi), %r14 + movq -24(%rdi), %r15 + movq -40(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + 
movq %rbp, -56(%rdi) + # Row 3 + movq 64(%rdi), %r12 + movq 88(%rdi), %r13 + movq 72(%rdi), %r14 + movq 96(%rdi), %r15 + movq 80(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 96(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 64(%rdi) + # Row 4 + xorq -16(%rdi), %r9 + xorq 8(%rdi), %r10 + xorq -8(%rdi), %r11 + xorq 16(%rdi), %rax + xorq (%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -16(%rdi) + movq %r13, 8(%rdi) + movq %r14, -8(%rdi) + movq %r15, 16(%rdi) + movq %rbx, (%rdi) + # Round 18 + xorq %rcx, %r12 + xorq -88(%rdi), %r14 + xorq -80(%rdi), %rbx + xorq -72(%rdi), %r13 + xorq -64(%rdi), %r15 + xorq -56(%rdi), %r12 + xorq -48(%rdi), %r14 + xorq -40(%rdi), %rbx + xorq -32(%rdi), %r13 + xorq -24(%rdi), %r15 + xorq 24(%rdi), %r12 + xorq 32(%rdi), %r14 + xorq 40(%rdi), %rbx + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r15 + xorq 64(%rdi), %r12 + xorq 72(%rdi), %r14 + xorq 80(%rdi), %rbx + xorq 88(%rdi), %r13 + xorq 96(%rdi), %r15 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 48(%rdi), %r13 + movq -48(%rdi), %r14 + movq 96(%rdi), %r15 + movq (%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 96(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, (%rdi) + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq $0x800a, %rcx + # Row 1 + movq -64(%rdi), %r12 + movq 40(%rdi), %r13 + movq -56(%rdi), %r14 + movq 88(%rdi), %r15 + movq -8(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 88(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -64(%rdi) + # Row 2 + movq -72(%rdi), %r12 + movq 32(%rdi), %r13 + movq -24(%rdi), %r14 + movq 80(%rdi), %r15 + movq -16(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -24(%rdi) + andnq 
%r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -72(%rdi) + # Row 3 + movq -80(%rdi), %r12 + movq 24(%rdi), %r13 + movq -32(%rdi), %r14 + movq 72(%rdi), %r15 + movq 16(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 24(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -80(%rdi) + # Row 4 + xorq -88(%rdi), %r9 + xorq 56(%rdi), %r10 + xorq -40(%rdi), %r11 + xorq 64(%rdi), %rax + xorq 8(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -88(%rdi) + movq %r13, 56(%rdi) + movq %r14, -40(%rdi) + movq %r15, 64(%rdi) + movq %rbx, 8(%rdi) + # Round 19 + xorq %rcx, %r12 + xorq -80(%rdi), %r12 + xorq -72(%rdi), %r12 + xorq -64(%rdi), %r12 + xorq -56(%rdi), %r14 + xorq -48(%rdi), %r14 + xorq -32(%rdi), %r14 + xorq -24(%rdi), %r14 + xorq -16(%rdi), %rbx + xorq -8(%rdi), %rbx + xorq (%rdi), %rbx + xorq 16(%rdi), %rbx + xorq 24(%rdi), %r13 + xorq 32(%rdi), %r13 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r13 + xorq 72(%rdi), %r15 + xorq 80(%rdi), %r15 + xorq 88(%rdi), %r15 + xorq 96(%rdi), %r15 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq 40(%rdi), %r13 + movq -24(%rdi), %r14 + movq 72(%rdi), %r15 + movq 8(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -24(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 8(%rdi) + movq $0x800000008000000a, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 96(%rdi), %r12 + movq -8(%rdi), %r13 + movq -72(%rdi), %r14 + movq 24(%rdi), %r15 + movq -40(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 24(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -40(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 96(%rdi) + # Row 2 + movq 48(%rdi), %r12 + movq -56(%rdi), %r13 + movq 80(%rdi), %r14 + movq 16(%rdi), %r15 + movq -88(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 
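+ # Rho: each lane of this row is rotated left by its fixed Keccak offset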
+ rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -56(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 16(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 48(%rdi) + # Row 3 + movq (%rdi), %r12 + movq -64(%rdi), %r13 + movq 32(%rdi), %r14 + movq -32(%rdi), %r15 + movq 64(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -64(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 32(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 64(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, (%rdi) + # Row 4 + xorq -48(%rdi), %r9 + xorq 88(%rdi), %r10 + xorq -16(%rdi), %r11 + xorq -80(%rdi), %rax + xorq 56(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -48(%rdi) + movq %r13, 88(%rdi) + movq %r14, -16(%rdi) + movq %r15, -80(%rdi) + movq %rbx, 56(%rdi) + # Round 20 + xorq %rcx, %r12 + xorq -88(%rdi), %rbx + xorq -72(%rdi), %r14 + xorq -64(%rdi), %r13 + xorq -56(%rdi), %r13 + xorq -40(%rdi), %rbx + xorq -32(%rdi), %r15 + xorq -24(%rdi), %r14 + xorq -8(%rdi), %r13 + xorq (%rdi), %r12 + xorq 8(%rdi), %rbx + xorq 16(%rdi), %r15 + xorq 24(%rdi), %r15 + xorq 32(%rdi), %r14 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r12 + xorq 64(%rdi), %rbx + xorq 72(%rdi), %r15 + xorq 80(%rdi), %r14 + xorq 96(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -8(%rdi), %r13 + movq 80(%rdi), %r14 + movq -32(%rdi), %r15 + movq 56(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 56(%rdi) + movq $0x8000000080008081, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 72(%rdi), %r12 + movq -40(%rdi), %r13 + movq 48(%rdi), %r14 + movq -64(%rdi), %r15 + movq -16(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 72(%rdi) + # Row 2 + movq 
40(%rdi), %r12 + movq -72(%rdi), %r13 + movq 16(%rdi), %r14 + movq 64(%rdi), %r15 + movq -48(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 64(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 40(%rdi) + # Row 3 + movq 8(%rdi), %r12 + movq 96(%rdi), %r13 + movq -56(%rdi), %r14 + movq 32(%rdi), %r15 + movq -80(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 96(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -56(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 8(%rdi) + # Row 4 + xorq -24(%rdi), %r9 + xorq 24(%rdi), %r10 + xorq -88(%rdi), %r11 + xorq (%rdi), %rax + xorq 88(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, -24(%rdi) + movq %r13, 24(%rdi) + movq %r14, -88(%rdi) + movq %r15, (%rdi) + movq %rbx, 88(%rdi) + # Round 21 + xorq %rcx, %r12 + xorq -80(%rdi), %rbx + xorq -72(%rdi), %r13 + xorq -64(%rdi), %r15 + xorq -56(%rdi), %r14 + xorq -48(%rdi), %rbx + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r15 + xorq -16(%rdi), %rbx + xorq -8(%rdi), %r13 + xorq 8(%rdi), %r12 + xorq 16(%rdi), %r14 + xorq 32(%rdi), %r15 + xorq 40(%rdi), %r12 + xorq 48(%rdi), %r14 + xorq 56(%rdi), %rbx + xorq 64(%rdi), %r15 + xorq 72(%rdi), %r12 + xorq 80(%rdi), %r14 + xorq 96(%rdi), %r13 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -40(%rdi), %r13 + movq 16(%rdi), %r14 + movq 32(%rdi), %r15 + movq 88(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 16(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 88(%rdi) + movq $0x8000000000008080, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -32(%rdi), %r12 + movq -16(%rdi), %r13 + movq 40(%rdi), %r14 + movq 96(%rdi), %r15 + movq -88(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 40(%rdi) + andnq 
%r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 96(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -88(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -32(%rdi) + # Row 2 + movq -8(%rdi), %r12 + movq 48(%rdi), %r13 + movq 64(%rdi), %r14 + movq -80(%rdi), %r15 + movq -24(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -80(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -8(%rdi) + # Row 3 + movq 56(%rdi), %r12 + movq 72(%rdi), %r13 + movq -72(%rdi), %r14 + movq -56(%rdi), %r15 + movq (%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 72(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -72(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, (%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 56(%rdi) + # Row 4 + xorq 80(%rdi), %r9 + xorq -64(%rdi), %r10 + xorq -48(%rdi), %r11 + xorq 8(%rdi), %rax + xorq 24(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 80(%rdi) + movq %r13, -64(%rdi) + movq %r14, -48(%rdi) + movq %r15, 8(%rdi) + movq %rbx, 24(%rdi) + # Round 22 + xorq %rcx, %r12 + xorq -88(%rdi), %rbx + xorq -80(%rdi), %r15 + xorq -72(%rdi), %r14 + xorq -56(%rdi), %r15 + xorq -40(%rdi), %r13 + xorq -32(%rdi), %r12 + xorq -24(%rdi), %rbx + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r12 + xorq (%rdi), %rbx + xorq 16(%rdi), %r14 + xorq 32(%rdi), %r15 + xorq 40(%rdi), %r14 + xorq 48(%rdi), %r13 + xorq 56(%rdi), %r12 + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r13 + xorq 88(%rdi), %rbx + xorq 96(%rdi), %r15 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -16(%rdi), %r13 + movq 64(%rdi), %r14 + movq -56(%rdi), %r15 + movq 24(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -16(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 64(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -56(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 24(%rdi) + movq $0x80000001, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq 32(%rdi), %r12 + movq -88(%rdi), %r13 + movq -8(%rdi), %r14 + movq 72(%rdi), %r15 + movq -48(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq 
$20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -8(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -48(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 32(%rdi) + # Row 2 + movq -40(%rdi), %r12 + movq 40(%rdi), %r13 + movq -80(%rdi), %r14 + movq (%rdi), %r15 + movq 80(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 40(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, (%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 80(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -40(%rdi) + # Row 3 + movq 88(%rdi), %r12 + movq -32(%rdi), %r13 + movq 48(%rdi), %r14 + movq -72(%rdi), %r15 + movq 8(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 48(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 8(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 88(%rdi) + # Row 4 + xorq 16(%rdi), %r9 + xorq 96(%rdi), %r10 + xorq -24(%rdi), %r11 + xorq 56(%rdi), %rax + xorq -64(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 16(%rdi) + movq %r13, 96(%rdi) + movq %r14, -24(%rdi) + movq %r15, 56(%rdi) + movq %rbx, -64(%rdi) + # Round 23 + xorq %rcx, %r12 + xorq -88(%rdi), %r13 + xorq -80(%rdi), %r14 + xorq -72(%rdi), %r15 + xorq -56(%rdi), %r15 + xorq -48(%rdi), %rbx + xorq -40(%rdi), %r12 + xorq -32(%rdi), %r13 + xorq -16(%rdi), %r13 + xorq -8(%rdi), %r14 + xorq (%rdi), %r15 + xorq 8(%rdi), %rbx + xorq 24(%rdi), %rbx + xorq 32(%rdi), %r12 + xorq 40(%rdi), %r13 + xorq 48(%rdi), %r14 + xorq 64(%rdi), %r14 + xorq 72(%rdi), %r15 + xorq 80(%rdi), %rbx + xorq 88(%rdi), %r12 + # Calc t[0..4] + rorxq $63, %r13, %rax + rorxq $63, %r14, %r8 + rorxq $63, %r15, %r9 + rorxq $63, %rbx, %r10 + rorxq $63, %r12, %r11 + xorq %rbx, %rax + xorq %r12, %r8 + xorq %r13, %r9 + xorq %r14, %r10 + xorq %r15, %r11 + # Row Mix + # Row 0 + movq %rcx, %r12 + movq -88(%rdi), %r13 + movq -80(%rdi), %r14 + movq -72(%rdi), %r15 + movq -64(%rdi), %rbx + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + rolq $44, %r13 + rolq $43, %r14 + rolq $21, %r15 + rolq $14, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -88(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -80(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -72(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -64(%rdi) + movq $0x8000000080008008, %rbx + andnq %r14, %r13, %rcx + xorq %r12, %rcx + # XOR in constant + xorq %rbx, %rcx + # Row 1 + movq -56(%rdi), 
%r12 + movq -48(%rdi), %r13 + movq -40(%rdi), %r14 + movq -32(%rdi), %r15 + movq -24(%rdi), %rbx + xorq %r10, %r12 + xorq %r11, %r13 + xorq %rax, %r14 + xorq %r8, %r15 + xorq %r9, %rbx + rolq $28, %r12 + rolq $20, %r13 + rolq $3, %r14 + rolq $45, %r15 + rolq $61, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -48(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, -40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, -32(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, -24(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -56(%rdi) + # Row 2 + movq -16(%rdi), %r12 + movq -8(%rdi), %r13 + movq (%rdi), %r14 + movq 8(%rdi), %r15 + movq 16(%rdi), %rbx + xorq %r8, %r12 + xorq %r9, %r13 + xorq %r10, %r14 + xorq %r11, %r15 + xorq %rax, %rbx + rolq $0x01, %r12 + rolq $6, %r13 + rolq $25, %r14 + rolq $8, %r15 + rolq $18, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, -8(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, (%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 8(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 16(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, -16(%rdi) + # Row 3 + movq 24(%rdi), %r12 + movq 32(%rdi), %r13 + movq 40(%rdi), %r14 + movq 48(%rdi), %r15 + movq 56(%rdi), %rbx + xorq %r11, %r12 + xorq %rax, %r13 + xorq %r8, %r14 + xorq %r9, %r15 + xorq %r10, %rbx + rolq $27, %r12 + rolq $36, %r13 + rolq $10, %r14 + rolq $15, %r15 + rolq $56, %rbx + andnq %r15, %r14, %rbp + xorq %r13, %rbp + movq %rbp, 32(%rdi) + andnq %rbx, %r15, %rbp + xorq %r14, %rbp + movq %rbp, 40(%rdi) + andnq %r12, %rbx, %rbp + xorq %r15, %rbp + movq %rbp, 48(%rdi) + andnq %r13, %r12, %rbp + xorq %rbx, %rbp + movq %rbp, 56(%rdi) + andnq %r14, %r13, %rbp + xorq %r12, %rbp + movq %rbp, 24(%rdi) + # Row 4 + xorq 64(%rdi), %r9 + xorq 72(%rdi), %r10 + xorq 80(%rdi), %r11 + xorq 88(%rdi), %rax + xorq 96(%rdi), %r8 + rorxq $2, %r9, %r12 + rorxq $9, %r10, %r13 + rorxq $25, %r11, %r14 + rorxq $23, %rax, %r15 + rorxq $62, %r8, %rbx + andnq %r14, %r13, %rax + andnq %r15, %r14, %r8 + andnq %rbx, %r15, %r9 + andnq %r12, %rbx, %r10 + andnq %r13, %r12, %r11 + xorq %rax, %r12 + xorq %r8, %r13 + xorq %r9, %r14 + xorq %r10, %r15 + xorq %r11, %rbx + movq %r12, 64(%rdi) + movq %r13, 72(%rdi) + movq %r14, 80(%rdi) + movq %r15, 88(%rdi) + movq %rbx, 96(%rdi) + addq (%rsp), %rsi + subl $0x01, %edx + movq (%rsp), %rbp + jg L_sha3_block_n_bmi2_start + movq %rcx, -96(%rdi) + popq %rbp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sha3_block_n_bmi2,.-sha3_block_n_bmi2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_sha3_block_avx2_rotl: +.quad 0x1,0x3e +.quad 0x1c,0x1b +.quad 0x2c,0x6 +.quad 0x37,0x14 +.quad 0xa,0x2b +.quad 0x19,0x27 +.quad 0x2d,0xf +.quad 0x15,0x8 +.quad 0x24,0x3 +.quad 0x29,0x12 +.quad 0x2,0x3d +.quad 0x38,0xe +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 16 +#else +.p2align 4 +#endif /* __APPLE__ */ +L_sha3_block_avx2_rotr: +.quad 0x3f,0x2 +.quad 0x24,0x25 +.quad 0x14,0x3a +.quad 0x9,0x2c +.quad 0x36,0x15 +.quad 0x27,0x19 +.quad 0x13,0x31 +.quad 0x2b,0x38 +.quad 0x1c,0x3d +.quad 0x17,0x2e +.quad 0x3e,0x3 +.quad 0x8,0x32 +#ifndef __APPLE__ +.text +.globl sha3_block_avx2 +.type sha3_block_avx2,@function +.align 16 +sha3_block_avx2: 
+#else +.section __TEXT,__text +.globl _sha3_block_avx2 +.p2align 4 +_sha3_block_avx2: +#endif /* __APPLE__ */ + leaq L_sha3_avx2_r(%rip), %rdx + leaq L_sha3_block_avx2_rotl(%rip), %rax + addq $0x40, %rax + leaq L_sha3_block_avx2_rotr(%rip), %rcx + addq $0x40, %rcx + movq $24, %r8 + vpbroadcastq (%rdi), %ymm0 + vmovdqu 8(%rdi), %ymm1 + vmovdqu 40(%rdi), %ymm2 + vmovdqu 72(%rdi), %ymm3 + vmovdqu 104(%rdi), %ymm4 + vmovdqu 136(%rdi), %ymm5 + vmovdqu 168(%rdi), %ymm6 + vpermq $57, %ymm2, %ymm7 + vpermq $30, %ymm3, %ymm8 + vpermq $0x4b, %ymm4, %ymm9 + vpermq $0x93, %ymm5, %ymm10 + vpblendd $12, %ymm3, %ymm2, %ymm11 + vpblendd $0xc0, %ymm5, %ymm4, %ymm12 + vpblendd $0xc0, %ymm8, %ymm7, %ymm2 + vpblendd $0xf0, %ymm9, %ymm8, %ymm3 + vpblendd $3, %ymm9, %ymm10, %ymm4 + vpblendd $0xf0, %ymm12, %ymm11, %ymm5 +L_sha3_block_avx2_start: + # Calc b[0..4] + vpshufd $0xee, %ymm5, %ymm7 + vpxor %ymm7, %ymm5, %ymm14 + vpxor %ymm2, %ymm1, %ymm15 + vpermq $0xaa, %ymm14, %ymm7 + vpxor %ymm0, %ymm14, %ymm14 + vpxor %ymm4, %ymm3, %ymm12 + vpxor %ymm7, %ymm14, %ymm14 + vpermq $0x00, %ymm14, %ymm14 + vpxor %ymm6, %ymm15, %ymm15 + vpxor %ymm12, %ymm15, %ymm15 + # XOR in b[x+4] + vpermq $0x93, %ymm15, %ymm7 + vpermq $57, %ymm15, %ymm9 + vpermq $0xff, %ymm15, %ymm8 + vpermq $0x00, %ymm15, %ymm10 + vpblendd $3, %ymm14, %ymm7, %ymm7 + vpblendd $0xc0, %ymm14, %ymm9, %ymm9 + vpxor %ymm8, %ymm0, %ymm0 + vpxor %ymm7, %ymm1, %ymm1 + vpxor %ymm7, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vpxor %ymm7, %ymm4, %ymm4 + vpxor %ymm8, %ymm5, %ymm5 + vpxor %ymm7, %ymm6, %ymm6 + # Rotate left 1 + vpsrlq $63, %ymm9, %ymm7 + vpsrlq $63, %ymm10, %ymm8 + vpaddq %ymm9, %ymm9, %ymm9 + vpaddq %ymm10, %ymm10, %ymm10 + vpor %ymm7, %ymm9, %ymm9 + vpor %ymm8, %ymm10, %ymm10 + # XOR in ROTL64(b[x+1]) + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm9, %ymm1, %ymm1 + vpxor %ymm9, %ymm2, %ymm2 + vpxor %ymm9, %ymm3, %ymm3 + vpxor %ymm9, %ymm4, %ymm4 + vpxor %ymm10, %ymm5, %ymm5 + vpxor %ymm9, %ymm6, %ymm6 + # Shuffle - Rotate + vpsrlvq -64(%rcx), %ymm1, %ymm8 + vpsrlvq -32(%rcx), %ymm2, %ymm9 + vpsrlvq (%rcx), %ymm3, %ymm10 + vpsrlvq 32(%rcx), %ymm4, %ymm11 + vpsrlvq 64(%rcx), %ymm5, %ymm12 + vpsrlvq 96(%rcx), %ymm6, %ymm13 + vpsllvq -64(%rax), %ymm1, %ymm1 + vpsllvq -32(%rax), %ymm2, %ymm2 + vpsllvq (%rax), %ymm3, %ymm3 + vpsllvq 32(%rax), %ymm4, %ymm4 + vpsllvq 64(%rax), %ymm5, %ymm5 + vpsllvq 96(%rax), %ymm6, %ymm6 + vpor %ymm8, %ymm1, %ymm1 + vpor %ymm9, %ymm2, %ymm2 + vpor %ymm10, %ymm3, %ymm3 + vpor %ymm11, %ymm4, %ymm4 + vpor %ymm12, %ymm5, %ymm5 + vpor %ymm13, %ymm6, %ymm6 + # Row Mix + vpermq $0x00, %ymm2, %ymm12 + vpermq $0x55, %ymm3, %ymm13 + vpermq $0xaa, %ymm4, %ymm14 + vpermq $0xff, %ymm6, %ymm15 + vpandn %ymm14, %ymm13, %ymm7 + vpandn %ymm15, %ymm14, %ymm8 + vpandn %ymm0, %ymm15, %ymm9 + vpandn %ymm12, %ymm0, %ymm10 + vpandn %ymm13, %ymm12, %ymm11 + vpxor %ymm7, %ymm12, %ymm12 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm0, %ymm0 + vpermq $0x8d, %ymm5, %ymm7 + vpblendd $12, %ymm13, %ymm12, %ymm10 + vpermq $0x72, %ymm1, %ymm11 + vpblendd $0xc0, %ymm15, %ymm14, %ymm9 + vpermq $0x87, %ymm2, %ymm12 + vpblendd $0xf0, %ymm9, %ymm10, %ymm1 + vpermq $0xc9, %ymm3, %ymm13 + vpermq $0x9c, %ymm4, %ymm14 + vpermq $45, %ymm6, %ymm15 + vpblendd $48, %ymm7, %ymm12, %ymm12 + vpblendd $3, %ymm7, %ymm13, %ymm13 + vpblendd $0xc0, %ymm7, %ymm14, %ymm14 + vpblendd $12, %ymm7, %ymm15, %ymm15 + vpandn %ymm13, %ymm12, %ymm5 + vpandn %ymm14, %ymm13, %ymm7 + vpandn %ymm15, %ymm14, %ymm2 + vpandn 
%ymm11, %ymm15, %ymm3 + vpandn %ymm12, %ymm11, %ymm4 + vpxor %ymm5, %ymm11, %ymm5 + vpxor %ymm7, %ymm12, %ymm12 + vpxor %ymm2, %ymm13, %ymm13 + vpxor %ymm3, %ymm14, %ymm14 + vpxor %ymm4, %ymm15, %ymm15 + vpunpcklqdq %ymm13, %ymm12, %ymm2 + vpunpckhqdq %ymm13, %ymm12, %ymm3 + vpunpcklqdq %ymm15, %ymm14, %ymm7 + vpunpckhqdq %ymm15, %ymm14, %ymm8 + vperm2i128 $49, %ymm7, %ymm2, %ymm4 + vperm2i128 $49, %ymm8, %ymm3, %ymm6 + vperm2i128 $32, %ymm7, %ymm2, %ymm2 + vperm2i128 $32, %ymm8, %ymm3, %ymm3 + vpxor (%rdx), %ymm0, %ymm0 + addq $32, %rdx + subq $0x01, %r8 + jnz L_sha3_block_avx2_start + vpermq $0x93, %ymm2, %ymm7 + vpermq $0x4e, %ymm3, %ymm8 + vpermq $57, %ymm4, %ymm9 + vpblendd $3, %ymm5, %ymm7, %ymm2 + vpblendd $3, %ymm7, %ymm8, %ymm3 + vpblendd $12, %ymm5, %ymm3, %ymm3 + vpblendd $0xc0, %ymm9, %ymm8, %ymm4 + vpblendd $48, %ymm5, %ymm4, %ymm4 + vpblendd $0xc0, %ymm5, %ymm9, %ymm5 + vmovq %xmm0, (%rdi) + vmovdqu %ymm1, 8(%rdi) + vmovdqu %ymm2, 40(%rdi) + vmovdqu %ymm3, 72(%rdi) + vmovdqu %ymm4, 104(%rdi) + vmovdqu %ymm5, 136(%rdi) + vmovdqu %ymm6, 168(%rdi) + vzeroupper + repz retq +#ifndef __APPLE__ +.size sha3_block_avx2,.-sha3_block_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_sha3_blocksx4_avx2 +.type kyber_sha3_blocksx4_avx2,@function +.align 16 +kyber_sha3_blocksx4_avx2: +#else +.section __TEXT,__text +.globl _kyber_sha3_blocksx4_avx2 +.p2align 4 +_kyber_sha3_blocksx4_avx2: +#endif /* __APPLE__ */ + leaq L_sha3_x4_avx2_r(%rip), %rdx + vmovdqu (%rdi), %ymm15 + movq %rdi, %rax + movq %rdi, %rcx + addq $0x80, %rdi + addq $0x180, %rax + addq $0x280, %rcx + # Round 0 + # Calc b[0..4] + vmovdqu -96(%rdi), %ymm11 + vmovdqu -64(%rdi), %ymm12 + vmovdqu -32(%rdi), %ymm13 + vmovdqu (%rdi), %ymm14 + vpxor 32(%rdi), %ymm15, %ymm10 + vpxor 64(%rdi), %ymm11, %ymm11 + vpxor 96(%rdi), %ymm12, %ymm12 + vpxor 128(%rdi), %ymm13, %ymm13 + vpxor -96(%rax), %ymm14, %ymm14 + vpxor -64(%rax), %ymm10, %ymm10 + vpxor -32(%rax), %ymm11, %ymm11 + vpxor (%rax), %ymm12, %ymm12 + vpxor 32(%rax), %ymm13, %ymm13 + vpxor 64(%rax), %ymm14, %ymm14 + vpxor 96(%rax), %ymm10, %ymm10 + vpxor 128(%rax), %ymm11, %ymm11 + vpxor -96(%rcx), %ymm12, %ymm12 + vpxor -64(%rcx), %ymm13, %ymm13 + vpxor -32(%rcx), %ymm14, %ymm14 + vpxor (%rcx), %ymm10, %ymm10 + vpxor 32(%rcx), %ymm11, %ymm11 + vpxor 64(%rcx), %ymm12, %ymm12 + vpxor 96(%rcx), %ymm13, %ymm13 + vpxor 128(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 64(%rdi), %ymm6, %ymm11 + vpxor (%rax), %ymm7, %ymm12 + vpxor -64(%rcx), %ymm8, %ymm13 + vpxor 128(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, 
%ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor (%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, (%rax) + vmovdqu %ymm3, -64(%rcx) + vmovdqu %ymm4, 128(%rcx) + # Row 1 + vpxor -32(%rdi), %ymm8, %ymm10 + vpxor -96(%rax), %ymm9, %ymm11 + vpxor -64(%rax), %ymm5, %ymm12 + vpxor 128(%rax), %ymm6, %ymm13 + vpxor 64(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rdi) + vmovdqu %ymm1, -96(%rax) + vmovdqu %ymm2, -64(%rax) + vmovdqu %ymm3, 128(%rax) + vmovdqu %ymm4, 64(%rcx) + # Row 2 + vpxor -96(%rdi), %ymm6, %ymm10 + vpxor 96(%rdi), %ymm7, %ymm11 + vpxor 32(%rax), %ymm8, %ymm12 + vpxor -32(%rcx), %ymm9, %ymm13 + vpxor (%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rdi) + vmovdqu %ymm1, 96(%rdi) + vmovdqu %ymm2, 32(%rax) + vmovdqu %ymm3, -32(%rcx) + vmovdqu %ymm4, (%rcx) + # Row 3 + vpxor (%rdi), %ymm9, %ymm10 + vpxor 32(%rdi), %ymm5, %ymm11 + vpxor -32(%rax), %ymm6, %ymm12 + vpxor -96(%rcx), %ymm7, %ymm13 + vpxor 96(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, -32(%rax) + vmovdqu %ymm3, -96(%rcx) + vmovdqu %ymm4, 96(%rcx) + # Row 4 + vpxor 
-64(%rdi), %ymm7, %ymm10 + vpxor 128(%rdi), %ymm8, %ymm11 + vpxor 64(%rax), %ymm9, %ymm12 + vpxor 96(%rax), %ymm5, %ymm13 + vpxor 32(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rdi) + vmovdqu %ymm1, 128(%rdi) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, 32(%rcx) + # Round 1 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm10, %ymm10 + vpxor -32(%rdi), %ymm10, %ymm10 + vpxor (%rdi), %ymm10, %ymm10 + vpxor 32(%rdi), %ymm1, %ymm11 + vpxor 64(%rdi), %ymm11, %ymm11 + vpxor 96(%rdi), %ymm11, %ymm11 + vpxor -96(%rax), %ymm11, %ymm11 + vpxor -64(%rax), %ymm2, %ymm12 + vpxor -32(%rax), %ymm12, %ymm12 + vpxor (%rax), %ymm12, %ymm12 + vpxor 32(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm3, %ymm13 + vpxor -96(%rcx), %ymm13, %ymm13 + vpxor -64(%rcx), %ymm13, %ymm13 + vpxor -32(%rcx), %ymm13, %ymm13 + vpxor (%rcx), %ymm4, %ymm14 + vpxor 64(%rcx), %ymm14, %ymm14 + vpxor 96(%rcx), %ymm14, %ymm14 + vpxor 128(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -96(%rax), %ymm6, %ymm11 + vpxor 32(%rax), %ymm7, %ymm12 + vpxor -96(%rcx), %ymm8, %ymm13 + vpxor 32(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 32(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -96(%rax) + vmovdqu %ymm2, 32(%rax) + vmovdqu %ymm3, -96(%rcx) + vmovdqu %ymm4, 32(%rcx) + # Row 1 + vpxor -64(%rcx), %ymm8, %ymm10 + vpxor 64(%rcx), %ymm9, %ymm11 + vpxor -96(%rdi), %ymm5, %ymm12 + vpxor 32(%rdi), %ymm6, %ymm13 + vpxor 64(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, 
%ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rcx) + vmovdqu %ymm1, 64(%rcx) + vmovdqu %ymm2, -96(%rdi) + vmovdqu %ymm3, 32(%rdi) + vmovdqu %ymm4, 64(%rax) + # Row 2 + vpxor 64(%rdi), %ymm6, %ymm10 + vpxor -64(%rax), %ymm7, %ymm11 + vpxor -32(%rcx), %ymm8, %ymm12 + vpxor 96(%rcx), %ymm9, %ymm13 + vpxor -64(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm1, -64(%rax) + vmovdqu %ymm2, -32(%rcx) + vmovdqu %ymm3, 96(%rcx) + vmovdqu %ymm4, -64(%rdi) + # Row 3 + vpxor 128(%rcx), %ymm9, %ymm10 + vpxor -32(%rdi), %ymm5, %ymm11 + vpxor 96(%rdi), %ymm6, %ymm12 + vpxor -32(%rax), %ymm7, %ymm13 + vpxor 96(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rcx) + vmovdqu %ymm1, -32(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, -32(%rax) + vmovdqu %ymm4, 96(%rax) + # Row 4 + vpxor (%rax), %ymm7, %ymm10 + vpxor 128(%rax), %ymm8, %ymm11 + vpxor (%rcx), %ymm9, %ymm12 + vpxor (%rdi), %ymm5, %ymm13 + vpxor 128(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + 
vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rax) + vmovdqu %ymm1, 128(%rax) + vmovdqu %ymm2, (%rcx) + vmovdqu %ymm3, (%rdi) + vmovdqu %ymm4, 128(%rdi) + # Round 2 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm2, %ymm12 + vpxor -64(%rdi), %ymm4, %ymm14 + vpxor -32(%rdi), %ymm1, %ymm11 + vpxor 32(%rdi), %ymm3, %ymm13 + vpxor 64(%rdi), %ymm10, %ymm10 + vpxor 96(%rdi), %ymm12, %ymm12 + vpxor -96(%rax), %ymm11, %ymm11 + vpxor -64(%rax), %ymm11, %ymm11 + vpxor -32(%rax), %ymm13, %ymm13 + vpxor 32(%rax), %ymm12, %ymm12 + vpxor 64(%rax), %ymm14, %ymm14 + vpxor 96(%rax), %ymm14, %ymm14 + vpxor -96(%rcx), %ymm13, %ymm13 + vpxor -64(%rcx), %ymm10, %ymm10 + vpxor -32(%rcx), %ymm12, %ymm12 + vpxor 32(%rcx), %ymm14, %ymm14 + vpxor 64(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm13, %ymm13 + vpxor 128(%rcx), %ymm10, %ymm10 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 64(%rcx), %ymm6, %ymm11 + vpxor -32(%rcx), %ymm7, %ymm12 + vpxor -32(%rax), %ymm8, %ymm13 + vpxor 128(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 64(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 64(%rcx) + vmovdqu %ymm2, -32(%rcx) + vmovdqu %ymm3, -32(%rax) + vmovdqu %ymm4, 128(%rdi) + # Row 1 + vpxor -96(%rcx), %ymm8, %ymm10 + vpxor 64(%rax), %ymm9, %ymm11 + vpxor 64(%rdi), %ymm5, %ymm12 + vpxor -32(%rdi), %ymm6, %ymm13 + vpxor (%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rcx) + vmovdqu %ymm1, 64(%rax) + vmovdqu %ymm2, 64(%rdi) + 
vmovdqu %ymm3, -32(%rdi) + vmovdqu %ymm4, (%rcx) + # Row 2 + vpxor -96(%rax), %ymm6, %ymm10 + vpxor -96(%rdi), %ymm7, %ymm11 + vpxor 96(%rcx), %ymm8, %ymm12 + vpxor 96(%rax), %ymm9, %ymm13 + vpxor (%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rax) + vmovdqu %ymm1, -96(%rdi) + vmovdqu %ymm2, 96(%rcx) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, (%rax) + # Row 3 + vpxor 32(%rcx), %ymm9, %ymm10 + vpxor -64(%rcx), %ymm5, %ymm11 + vpxor -64(%rax), %ymm6, %ymm12 + vpxor 96(%rdi), %ymm7, %ymm13 + vpxor (%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rcx) + vmovdqu %ymm1, -64(%rcx) + vmovdqu %ymm2, -64(%rax) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, (%rdi) + # Row 4 + vpxor 32(%rax), %ymm7, %ymm10 + vpxor 32(%rdi), %ymm8, %ymm11 + vpxor -64(%rdi), %ymm9, %ymm12 + vpxor 128(%rcx), %ymm5, %ymm13 + vpxor 128(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rax) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, -64(%rdi) + vmovdqu %ymm3, 128(%rcx) + vmovdqu %ymm4, 128(%rax) + # Round 3 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm1, %ymm11 + vpxor -32(%rdi), %ymm3, %ymm13 + vpxor (%rdi), %ymm4, %ymm14 + vpxor 64(%rdi), %ymm2, %ymm12 + vpxor 96(%rdi), %ymm13, %ymm13 + vpxor 128(%rdi), %ymm14, %ymm14 + vpxor -96(%rax), %ymm10, %ymm10 + vpxor -64(%rax), %ymm12, %ymm12 + vpxor -32(%rax), %ymm13, %ymm13 + vpxor (%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm11, 
%ymm11 + vpxor 96(%rax), %ymm13, %ymm13 + vpxor -96(%rcx), %ymm10, %ymm10 + vpxor -64(%rcx), %ymm11, %ymm11 + vpxor -32(%rcx), %ymm12, %ymm12 + vpxor (%rcx), %ymm14, %ymm14 + vpxor 32(%rcx), %ymm10, %ymm10 + vpxor 64(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm12, %ymm12 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 64(%rax), %ymm6, %ymm11 + vpxor 96(%rcx), %ymm7, %ymm12 + vpxor 96(%rdi), %ymm8, %ymm13 + vpxor 128(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 96(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 64(%rax) + vmovdqu %ymm2, 96(%rcx) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, 128(%rax) + # Row 1 + vpxor -32(%rax), %ymm8, %ymm10 + vpxor (%rcx), %ymm9, %ymm11 + vpxor -96(%rax), %ymm5, %ymm12 + vpxor -64(%rcx), %ymm6, %ymm13 + vpxor -64(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rax) + vmovdqu %ymm1, (%rcx) + vmovdqu %ymm2, -96(%rax) + vmovdqu %ymm3, -64(%rcx) + vmovdqu %ymm4, -64(%rdi) + # Row 2 + vpxor 64(%rcx), %ymm6, %ymm10 + vpxor 64(%rdi), %ymm7, %ymm11 + vpxor 96(%rax), %ymm8, %ymm12 + vpxor (%rdi), %ymm9, %ymm13 + vpxor 32(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn 
%ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rcx) + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rax) + vmovdqu %ymm3, (%rdi) + vmovdqu %ymm4, 32(%rax) + # Row 3 + vpxor 128(%rdi), %ymm9, %ymm10 + vpxor -96(%rcx), %ymm5, %ymm11 + vpxor -96(%rdi), %ymm6, %ymm12 + vpxor -64(%rax), %ymm7, %ymm13 + vpxor 128(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm1, -96(%rcx) + vmovdqu %ymm2, -96(%rdi) + vmovdqu %ymm3, -64(%rax) + vmovdqu %ymm4, 128(%rcx) + # Row 4 + vpxor -32(%rcx), %ymm7, %ymm10 + vpxor -32(%rdi), %ymm8, %ymm11 + vpxor (%rax), %ymm9, %ymm12 + vpxor 32(%rcx), %ymm5, %ymm13 + vpxor 32(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rcx) + vmovdqu %ymm1, -32(%rdi) + vmovdqu %ymm2, (%rax) + vmovdqu %ymm3, 32(%rcx) + vmovdqu %ymm4, 32(%rdi) + # Round 4 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm2, %ymm12 + vpxor -64(%rdi), %ymm4, %ymm14 + vpxor (%rdi), %ymm3, %ymm13 + vpxor 64(%rdi), %ymm1, %ymm11 + vpxor 96(%rdi), %ymm13, %ymm13 + vpxor 128(%rdi), %ymm10, %ymm10 + vpxor -96(%rax), %ymm12, %ymm12 + vpxor -64(%rax), %ymm13, %ymm13 + vpxor -32(%rax), %ymm10, %ymm10 + vpxor 32(%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm11, %ymm11 + vpxor 96(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm14, %ymm14 + vpxor -96(%rcx), %ymm11, %ymm11 + vpxor -64(%rcx), %ymm13, %ymm13 + vpxor (%rcx), %ymm11, %ymm11 + vpxor 64(%rcx), %ymm10, %ymm10 + vpxor 96(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, 
%ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor (%rcx), %ymm6, %ymm11 + vpxor 96(%rax), %ymm7, %ymm12 + vpxor -64(%rax), %ymm8, %ymm13 + vpxor 32(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 128(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, (%rcx) + vmovdqu %ymm2, 96(%rax) + vmovdqu %ymm3, -64(%rax) + vmovdqu %ymm4, 32(%rdi) + # Row 1 + vpxor 96(%rdi), %ymm8, %ymm10 + vpxor -64(%rdi), %ymm9, %ymm11 + vpxor 64(%rcx), %ymm5, %ymm12 + vpxor -96(%rcx), %ymm6, %ymm13 + vpxor (%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm1, -64(%rdi) + vmovdqu %ymm2, 64(%rcx) + vmovdqu %ymm3, -96(%rcx) + vmovdqu %ymm4, (%rax) + # Row 2 + vpxor 64(%rax), %ymm6, %ymm10 + vpxor -96(%rax), %ymm7, %ymm11 + vpxor (%rdi), %ymm8, %ymm12 + vpxor 128(%rcx), %ymm9, %ymm13 + vpxor -32(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rax) + vmovdqu %ymm1, -96(%rax) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, 128(%rcx) + vmovdqu %ymm4, -32(%rcx) + # Row 3 + vpxor 128(%rax), %ymm9, %ymm10 + vpxor -32(%rax), %ymm5, %ymm11 + vpxor 64(%rdi), %ymm6, %ymm12 + vpxor -96(%rdi), %ymm7, %ymm13 + vpxor 32(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, 
%ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rax) + vmovdqu %ymm1, -32(%rax) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, -96(%rdi) + vmovdqu %ymm4, 32(%rcx) + # Row 4 + vpxor 96(%rcx), %ymm7, %ymm10 + vpxor -64(%rcx), %ymm8, %ymm11 + vpxor 32(%rax), %ymm9, %ymm12 + vpxor 128(%rdi), %ymm5, %ymm13 + vpxor -32(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rcx) + vmovdqu %ymm1, -64(%rcx) + vmovdqu %ymm2, 32(%rax) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, -32(%rdi) + # Round 5 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm3, %ymm13 + vpxor -64(%rdi), %ymm1, %ymm11 + vpxor (%rdi), %ymm2, %ymm12 + vpxor 32(%rdi), %ymm4, %ymm14 + vpxor 64(%rdi), %ymm12, %ymm12 + vpxor 96(%rdi), %ymm10, %ymm10 + vpxor -96(%rax), %ymm11, %ymm11 + vpxor -64(%rax), %ymm13, %ymm13 + vpxor -32(%rax), %ymm11, %ymm11 + vpxor (%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm10, %ymm10 + vpxor 96(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm10, %ymm10 + vpxor -96(%rcx), %ymm13, %ymm13 + vpxor -32(%rcx), %ymm14, %ymm14 + vpxor (%rcx), %ymm11, %ymm11 + vpxor 32(%rcx), %ymm14, %ymm14 + vpxor 64(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm13, %ymm13 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -64(%rdi), %ymm6, %ymm11 + vpxor (%rdi), %ymm7, %ymm12 + vpxor -96(%rdi), %ymm8, %ymm13 + vpxor -32(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + 
vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 160(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -64(%rdi) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, -96(%rdi) + vmovdqu %ymm4, -32(%rdi) + # Row 1 + vpxor -64(%rax), %ymm8, %ymm10 + vpxor (%rax), %ymm9, %ymm11 + vpxor 64(%rax), %ymm5, %ymm12 + vpxor -32(%rax), %ymm6, %ymm13 + vpxor 32(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rax) + vmovdqu %ymm1, (%rax) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, -32(%rax) + vmovdqu %ymm4, 32(%rax) + # Row 2 + vpxor (%rcx), %ymm6, %ymm10 + vpxor 64(%rcx), %ymm7, %ymm11 + vpxor 128(%rcx), %ymm8, %ymm12 + vpxor 32(%rcx), %ymm9, %ymm13 + vpxor 96(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rcx) + vmovdqu %ymm1, 64(%rcx) + vmovdqu %ymm2, 128(%rcx) + vmovdqu %ymm3, 32(%rcx) + vmovdqu %ymm4, 96(%rcx) + # Row 3 + vpxor 32(%rdi), %ymm9, %ymm10 + vpxor 96(%rdi), %ymm5, %ymm11 + vpxor -96(%rax), %ymm6, %ymm12 + vpxor 64(%rdi), %ymm7, %ymm13 + vpxor 128(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm1, 96(%rdi) + vmovdqu %ymm2, -96(%rax) + vmovdqu %ymm3, 64(%rdi) + vmovdqu %ymm4, 128(%rdi) + # Row 4 + vpxor 96(%rax), %ymm7, 
%ymm10 + vpxor -96(%rcx), %ymm8, %ymm11 + vpxor -32(%rcx), %ymm9, %ymm12 + vpxor 128(%rax), %ymm5, %ymm13 + vpxor -64(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rax) + vmovdqu %ymm1, -96(%rcx) + vmovdqu %ymm2, -32(%rcx) + vmovdqu %ymm3, 128(%rax) + vmovdqu %ymm4, -64(%rcx) + # Round 6 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm3, %ymm13 + vpxor -64(%rdi), %ymm1, %ymm11 + vpxor -32(%rdi), %ymm4, %ymm14 + vpxor (%rdi), %ymm2, %ymm12 + vpxor 32(%rdi), %ymm10, %ymm10 + vpxor 64(%rdi), %ymm13, %ymm13 + vpxor 96(%rdi), %ymm11, %ymm11 + vpxor 128(%rdi), %ymm14, %ymm14 + vpxor -96(%rax), %ymm12, %ymm12 + vpxor -64(%rax), %ymm10, %ymm10 + vpxor -32(%rax), %ymm13, %ymm13 + vpxor (%rax), %ymm11, %ymm11 + vpxor 32(%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm12, %ymm12 + vpxor (%rcx), %ymm10, %ymm10 + vpxor 32(%rcx), %ymm13, %ymm13 + vpxor 64(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm14, %ymm14 + vpxor 128(%rcx), %ymm12, %ymm12 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor (%rax), %ymm6, %ymm11 + vpxor 128(%rcx), %ymm7, %ymm12 + vpxor 64(%rdi), %ymm8, %ymm13 + vpxor -64(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 192(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, (%rax) + vmovdqu %ymm2, 128(%rcx) + vmovdqu %ymm3, 64(%rdi) + vmovdqu %ymm4, -64(%rcx) + # Row 1 + vpxor -96(%rdi), %ymm8, %ymm10 + vpxor 32(%rax), %ymm9, %ymm11 + vpxor (%rcx), %ymm5, %ymm12 + vpxor 96(%rdi), %ymm6, %ymm13 + vpxor -32(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq 
$3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rdi) + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, (%rcx) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, -32(%rcx) + # Row 2 + vpxor -64(%rdi), %ymm6, %ymm10 + vpxor 64(%rax), %ymm7, %ymm11 + vpxor 32(%rcx), %ymm8, %ymm12 + vpxor 128(%rdi), %ymm9, %ymm13 + vpxor 96(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rdi) + vmovdqu %ymm1, 64(%rax) + vmovdqu %ymm2, 32(%rcx) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 96(%rax) + # Row 3 + vpxor -32(%rdi), %ymm9, %ymm10 + vpxor -64(%rax), %ymm5, %ymm11 + vpxor 64(%rcx), %ymm6, %ymm12 + vpxor -96(%rax), %ymm7, %ymm13 + vpxor 128(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rdi) + vmovdqu %ymm1, -64(%rax) + vmovdqu %ymm2, 64(%rcx) + vmovdqu %ymm3, -96(%rax) + vmovdqu %ymm4, 128(%rax) + # Row 4 + vpxor (%rdi), %ymm7, %ymm10 + vpxor -32(%rax), %ymm8, %ymm11 + vpxor 96(%rcx), %ymm9, %ymm12 + vpxor 32(%rdi), %ymm5, %ymm13 + vpxor -96(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, 
%ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rax) + vmovdqu %ymm2, 96(%rcx) + vmovdqu %ymm3, 32(%rdi) + vmovdqu %ymm4, -96(%rcx) + # Round 7 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm10, %ymm10 + vpxor -64(%rdi), %ymm10, %ymm10 + vpxor -32(%rdi), %ymm10, %ymm10 + vpxor 64(%rdi), %ymm3, %ymm13 + vpxor 96(%rdi), %ymm13, %ymm13 + vpxor 128(%rdi), %ymm13, %ymm13 + vpxor -96(%rax), %ymm13, %ymm13 + vpxor -64(%rax), %ymm1, %ymm11 + vpxor (%rax), %ymm11, %ymm11 + vpxor 32(%rax), %ymm11, %ymm11 + vpxor 64(%rax), %ymm11, %ymm11 + vpxor 96(%rax), %ymm4, %ymm14 + vpxor 128(%rax), %ymm14, %ymm14 + vpxor -64(%rcx), %ymm14, %ymm14 + vpxor -32(%rcx), %ymm14, %ymm14 + vpxor (%rcx), %ymm2, %ymm12 + vpxor 32(%rcx), %ymm12, %ymm12 + vpxor 64(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm12, %ymm12 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 32(%rax), %ymm6, %ymm11 + vpxor 32(%rcx), %ymm7, %ymm12 + vpxor -96(%rax), %ymm8, %ymm13 + vpxor -96(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 224(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, 32(%rcx) + vmovdqu %ymm3, -96(%rax) + vmovdqu %ymm4, -96(%rcx) + # Row 1 + vpxor 64(%rdi), %ymm8, %ymm10 + vpxor -32(%rcx), %ymm9, %ymm11 + vpxor -64(%rdi), %ymm5, %ymm12 + vpxor -64(%rax), %ymm6, %ymm13 + vpxor 96(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm1, -32(%rcx) + vmovdqu %ymm2, -64(%rdi) + vmovdqu %ymm3, 
-64(%rax) + vmovdqu %ymm4, 96(%rcx) + # Row 2 + vpxor (%rax), %ymm6, %ymm10 + vpxor (%rcx), %ymm7, %ymm11 + vpxor 128(%rdi), %ymm8, %ymm12 + vpxor 128(%rax), %ymm9, %ymm13 + vpxor (%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rax) + vmovdqu %ymm1, (%rcx) + vmovdqu %ymm2, 128(%rdi) + vmovdqu %ymm3, 128(%rax) + vmovdqu %ymm4, (%rdi) + # Row 3 + vpxor -64(%rcx), %ymm9, %ymm10 + vpxor -96(%rdi), %ymm5, %ymm11 + vpxor 64(%rax), %ymm6, %ymm12 + vpxor 64(%rcx), %ymm7, %ymm13 + vpxor 32(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rcx) + vmovdqu %ymm1, -96(%rdi) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 64(%rcx) + vmovdqu %ymm4, 32(%rdi) + # Row 4 + vpxor 128(%rcx), %ymm7, %ymm10 + vpxor 96(%rdi), %ymm8, %ymm11 + vpxor 96(%rax), %ymm9, %ymm12 + vpxor -32(%rdi), %ymm5, %ymm13 + vpxor -32(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rcx) + vmovdqu %ymm1, 96(%rdi) + vmovdqu %ymm2, 96(%rax) + vmovdqu %ymm3, -32(%rdi) + vmovdqu %ymm4, -32(%rax) + # Round 8 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm1, %ymm11 + vpxor -64(%rdi), %ymm2, %ymm12 + vpxor (%rdi), %ymm4, %ymm14 + vpxor 32(%rdi), %ymm14, %ymm14 + vpxor 64(%rdi), %ymm10, %ymm10 + vpxor 128(%rdi), %ymm12, %ymm12 + vpxor -96(%rax), %ymm3, %ymm13 + vpxor -64(%rax), %ymm13, %ymm13 + vpxor (%rax), %ymm10, %ymm10 + vpxor 32(%rax), %ymm11, %ymm11 + vpxor 64(%rax), %ymm12, %ymm12 + vpxor 
128(%rax), %ymm13, %ymm13 + vpxor -96(%rcx), %ymm14, %ymm14 + vpxor -64(%rcx), %ymm10, %ymm10 + vpxor -32(%rcx), %ymm11, %ymm11 + vpxor (%rcx), %ymm11, %ymm11 + vpxor 32(%rcx), %ymm12, %ymm12 + vpxor 64(%rcx), %ymm13, %ymm13 + vpxor 96(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -32(%rcx), %ymm6, %ymm11 + vpxor 128(%rdi), %ymm7, %ymm12 + vpxor 64(%rcx), %ymm8, %ymm13 + vpxor -32(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 256(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -32(%rcx) + vmovdqu %ymm2, 128(%rdi) + vmovdqu %ymm3, 64(%rcx) + vmovdqu %ymm4, -32(%rax) + # Row 1 + vpxor -96(%rax), %ymm8, %ymm10 + vpxor 96(%rcx), %ymm9, %ymm11 + vpxor (%rax), %ymm5, %ymm12 + vpxor -96(%rdi), %ymm6, %ymm13 + vpxor 96(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rax) + vmovdqu %ymm1, 96(%rcx) + vmovdqu %ymm2, (%rax) + vmovdqu %ymm3, -96(%rdi) + vmovdqu %ymm4, 96(%rax) + # Row 2 + vpxor 32(%rax), %ymm6, %ymm10 + vpxor -64(%rdi), %ymm7, %ymm11 + vpxor 128(%rax), %ymm8, %ymm12 + vpxor 32(%rdi), %ymm9, %ymm13 + vpxor 128(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, 
%ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rax) + vmovdqu %ymm1, -64(%rdi) + vmovdqu %ymm2, 128(%rax) + vmovdqu %ymm3, 32(%rdi) + vmovdqu %ymm4, 128(%rcx) + # Row 3 + vpxor -96(%rcx), %ymm9, %ymm10 + vpxor 64(%rdi), %ymm5, %ymm11 + vpxor (%rcx), %ymm6, %ymm12 + vpxor 64(%rax), %ymm7, %ymm13 + vpxor -32(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rcx) + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, (%rcx) + vmovdqu %ymm3, 64(%rax) + vmovdqu %ymm4, -32(%rdi) + # Row 4 + vpxor 32(%rcx), %ymm7, %ymm10 + vpxor -64(%rax), %ymm8, %ymm11 + vpxor (%rdi), %ymm9, %ymm12 + vpxor -64(%rcx), %ymm5, %ymm13 + vpxor 96(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rcx) + vmovdqu %ymm1, -64(%rax) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, -64(%rcx) + vmovdqu %ymm4, 96(%rdi) + # Round 9 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm3, %ymm13 + vpxor -64(%rdi), %ymm1, %ymm11 + vpxor -32(%rdi), %ymm4, %ymm14 + vpxor 32(%rdi), %ymm13, %ymm13 + vpxor 64(%rdi), %ymm11, %ymm11 + vpxor 128(%rdi), %ymm2, %ymm12 + vpxor -96(%rax), %ymm10, %ymm10 + vpxor -32(%rax), %ymm14, %ymm14 + vpxor (%rax), %ymm12, %ymm12 + vpxor 32(%rax), %ymm10, %ymm10 + vpxor 64(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm14, %ymm14 + vpxor 128(%rax), %ymm12, %ymm12 + vpxor -96(%rcx), %ymm10, %ymm10 + vpxor -32(%rcx), %ymm11, %ymm11 + vpxor (%rcx), %ymm12, %ymm12 + vpxor 64(%rcx), %ymm13, %ymm13 + vpxor 96(%rcx), %ymm11, %ymm11 + vpxor 128(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor 
%ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 96(%rcx), %ymm6, %ymm11 + vpxor 128(%rax), %ymm7, %ymm12 + vpxor 64(%rax), %ymm8, %ymm13 + vpxor 96(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 288(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 96(%rcx) + vmovdqu %ymm2, 128(%rax) + vmovdqu %ymm3, 64(%rax) + vmovdqu %ymm4, 96(%rdi) + # Row 1 + vpxor 64(%rcx), %ymm8, %ymm10 + vpxor 96(%rax), %ymm9, %ymm11 + vpxor 32(%rax), %ymm5, %ymm12 + vpxor 64(%rdi), %ymm6, %ymm13 + vpxor (%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rcx) + vmovdqu %ymm1, 96(%rax) + vmovdqu %ymm2, 32(%rax) + vmovdqu %ymm3, 64(%rdi) + vmovdqu %ymm4, (%rdi) + # Row 2 + vpxor -32(%rcx), %ymm6, %ymm10 + vpxor (%rax), %ymm7, %ymm11 + vpxor 32(%rdi), %ymm8, %ymm12 + vpxor -32(%rdi), %ymm9, %ymm13 + vpxor 32(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rcx) + vmovdqu %ymm1, (%rax) + vmovdqu %ymm2, 32(%rdi) + vmovdqu %ymm3, -32(%rdi) + vmovdqu %ymm4, 32(%rcx) + # Row 3 + vpxor -32(%rax), %ymm9, %ymm10 + vpxor -96(%rax), %ymm5, %ymm11 + vpxor -64(%rdi), %ymm6, %ymm12 + vpxor (%rcx), %ymm7, %ymm13 + vpxor -64(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + 
vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rax) + vmovdqu %ymm1, -96(%rax) + vmovdqu %ymm2, -64(%rdi) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm4, -64(%rcx) + # Row 4 + vpxor 128(%rdi), %ymm7, %ymm10 + vpxor -96(%rdi), %ymm8, %ymm11 + vpxor 128(%rcx), %ymm9, %ymm12 + vpxor -96(%rcx), %ymm5, %ymm13 + vpxor -64(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm1, -96(%rdi) + vmovdqu %ymm2, 128(%rcx) + vmovdqu %ymm3, -96(%rcx) + vmovdqu %ymm4, -64(%rax) + # Round 10 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -64(%rdi), %ymm2, %ymm12 + vpxor -32(%rdi), %ymm3, %ymm13 + vpxor (%rdi), %ymm4, %ymm14 + vpxor 32(%rdi), %ymm12, %ymm12 + vpxor 64(%rdi), %ymm13, %ymm13 + vpxor 96(%rdi), %ymm14, %ymm14 + vpxor -96(%rax), %ymm1, %ymm11 + vpxor -32(%rax), %ymm10, %ymm10 + vpxor (%rax), %ymm11, %ymm11 + vpxor 32(%rax), %ymm12, %ymm12 + vpxor 64(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm11, %ymm11 + vpxor 128(%rax), %ymm12, %ymm12 + vpxor -64(%rcx), %ymm14, %ymm14 + vpxor -32(%rcx), %ymm10, %ymm10 + vpxor (%rcx), %ymm13, %ymm13 + vpxor 32(%rcx), %ymm14, %ymm14 + vpxor 64(%rcx), %ymm10, %ymm10 + vpxor 96(%rcx), %ymm11, %ymm11 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 96(%rax), %ymm6, %ymm11 + vpxor 32(%rdi), %ymm7, %ymm12 + vpxor (%rcx), %ymm8, %ymm13 + vpxor -64(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, 
%ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 320(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 96(%rax) + vmovdqu %ymm2, 32(%rdi) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm4, -64(%rax) + # Row 1 + vpxor 64(%rax), %ymm8, %ymm10 + vpxor (%rdi), %ymm9, %ymm11 + vpxor -32(%rcx), %ymm5, %ymm12 + vpxor -96(%rax), %ymm6, %ymm13 + vpxor 128(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rax) + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm2, -32(%rcx) + vmovdqu %ymm3, -96(%rax) + vmovdqu %ymm4, 128(%rcx) + # Row 2 + vpxor 96(%rcx), %ymm6, %ymm10 + vpxor 32(%rax), %ymm7, %ymm11 + vpxor -32(%rdi), %ymm8, %ymm12 + vpxor -64(%rcx), %ymm9, %ymm13 + vpxor 128(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rcx) + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, -32(%rdi) + vmovdqu %ymm3, -64(%rcx) + vmovdqu %ymm4, 128(%rdi) + # Row 3 + vpxor 96(%rdi), %ymm9, %ymm10 + vpxor 64(%rcx), %ymm5, %ymm11 + vpxor (%rax), %ymm6, %ymm12 + vpxor -64(%rdi), %ymm7, %ymm13 + vpxor -96(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm1, 64(%rcx) + vmovdqu %ymm2, (%rax) + vmovdqu %ymm3, -64(%rdi) + vmovdqu %ymm4, -96(%rcx) + # Row 4 + vpxor 128(%rax), %ymm7, %ymm10 + vpxor 
64(%rdi), %ymm8, %ymm11 + vpxor 32(%rcx), %ymm9, %ymm12 + vpxor -32(%rax), %ymm5, %ymm13 + vpxor -96(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rax) + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 32(%rcx) + vmovdqu %ymm3, -32(%rax) + vmovdqu %ymm4, -96(%rdi) + # Round 11 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -64(%rdi), %ymm3, %ymm13 + vpxor -32(%rdi), %ymm2, %ymm12 + vpxor (%rdi), %ymm1, %ymm11 + vpxor 32(%rdi), %ymm12, %ymm12 + vpxor 96(%rdi), %ymm10, %ymm10 + vpxor 128(%rdi), %ymm4, %ymm14 + vpxor -96(%rax), %ymm13, %ymm13 + vpxor -64(%rax), %ymm14, %ymm14 + vpxor (%rax), %ymm12, %ymm12 + vpxor 32(%rax), %ymm11, %ymm11 + vpxor 64(%rax), %ymm10, %ymm10 + vpxor 96(%rax), %ymm11, %ymm11 + vpxor -96(%rcx), %ymm14, %ymm14 + vpxor -64(%rcx), %ymm13, %ymm13 + vpxor -32(%rcx), %ymm12, %ymm12 + vpxor (%rcx), %ymm13, %ymm13 + vpxor 64(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm10, %ymm10 + vpxor 128(%rcx), %ymm14, %ymm14 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor (%rdi), %ymm6, %ymm11 + vpxor -32(%rdi), %ymm7, %ymm12 + vpxor -64(%rdi), %ymm8, %ymm13 + vpxor -96(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 352(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm2, -32(%rdi) + vmovdqu %ymm3, -64(%rdi) + vmovdqu %ymm4, -96(%rdi) + # Row 1 + vpxor (%rcx), %ymm8, %ymm10 + vpxor 128(%rcx), %ymm9, %ymm11 + vpxor 96(%rcx), %ymm5, %ymm12 + vpxor 64(%rcx), %ymm6, %ymm13 + vpxor 32(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 
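+ # Each vpsrlq above pairs with the vpsllq of the same source register
+ # below; the following vpor merges the two shift halves, rotating every
+ # 64-bit lane left by the left-shift count.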
+ vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rcx) + vmovdqu %ymm1, 128(%rcx) + vmovdqu %ymm2, 96(%rcx) + vmovdqu %ymm3, 64(%rcx) + vmovdqu %ymm4, 32(%rcx) + # Row 2 + vpxor 96(%rax), %ymm6, %ymm10 + vpxor -32(%rcx), %ymm7, %ymm11 + vpxor -64(%rcx), %ymm8, %ymm12 + vpxor -96(%rcx), %ymm9, %ymm13 + vpxor 128(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rax) + vmovdqu %ymm1, -32(%rcx) + vmovdqu %ymm2, -64(%rcx) + vmovdqu %ymm3, -96(%rcx) + vmovdqu %ymm4, 128(%rax) + # Row 3 + vpxor -64(%rax), %ymm9, %ymm10 + vpxor 64(%rax), %ymm5, %ymm11 + vpxor 32(%rax), %ymm6, %ymm12 + vpxor (%rax), %ymm7, %ymm13 + vpxor -32(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rax) + vmovdqu %ymm1, 64(%rax) + vmovdqu %ymm2, 32(%rax) + vmovdqu %ymm3, (%rax) + vmovdqu %ymm4, -32(%rax) + # Row 4 + vpxor 32(%rdi), %ymm7, %ymm10 + vpxor -96(%rax), %ymm8, %ymm11 + vpxor 128(%rdi), %ymm9, %ymm12 + vpxor 96(%rdi), %ymm5, %ymm13 + vpxor 64(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor 
%ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm1, -96(%rax) + vmovdqu %ymm2, 128(%rdi) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, 64(%rdi) + # Round 12 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm4, %ymm14 + vpxor -64(%rdi), %ymm3, %ymm13 + vpxor -32(%rdi), %ymm2, %ymm12 + vpxor (%rdi), %ymm1, %ymm11 + vpxor -64(%rax), %ymm10, %ymm10 + vpxor -32(%rax), %ymm14, %ymm14 + vpxor (%rax), %ymm13, %ymm13 + vpxor 32(%rax), %ymm12, %ymm12 + vpxor 64(%rax), %ymm11, %ymm11 + vpxor 96(%rax), %ymm10, %ymm10 + vpxor 128(%rax), %ymm14, %ymm14 + vpxor -96(%rcx), %ymm13, %ymm13 + vpxor -64(%rcx), %ymm12, %ymm12 + vpxor -32(%rcx), %ymm11, %ymm11 + vpxor (%rcx), %ymm10, %ymm10 + vpxor 32(%rcx), %ymm14, %ymm14 + vpxor 64(%rcx), %ymm13, %ymm13 + vpxor 96(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm11, %ymm11 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 128(%rcx), %ymm6, %ymm11 + vpxor -64(%rcx), %ymm7, %ymm12 + vpxor (%rax), %ymm8, %ymm13 + vpxor 64(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 384(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 128(%rcx) + vmovdqu %ymm2, -64(%rcx) + vmovdqu %ymm3, (%rax) + vmovdqu %ymm4, 64(%rdi) + # Row 1 + vpxor -64(%rdi), %ymm8, %ymm10 + vpxor 32(%rcx), %ymm9, %ymm11 + vpxor 96(%rax), %ymm5, %ymm12 + vpxor 64(%rax), %ymm6, %ymm13 + vpxor 128(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rdi) + vmovdqu %ymm1, 32(%rcx) + vmovdqu %ymm2, 96(%rax) + vmovdqu %ymm3, 64(%rax) + vmovdqu %ymm4, 
128(%rdi) + # Row 2 + vpxor (%rdi), %ymm6, %ymm10 + vpxor 96(%rcx), %ymm7, %ymm11 + vpxor -96(%rcx), %ymm8, %ymm12 + vpxor -32(%rax), %ymm9, %ymm13 + vpxor 32(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 96(%rcx) + vmovdqu %ymm2, -96(%rcx) + vmovdqu %ymm3, -32(%rax) + vmovdqu %ymm4, 32(%rdi) + # Row 3 + vpxor -96(%rdi), %ymm9, %ymm10 + vpxor (%rcx), %ymm5, %ymm11 + vpxor -32(%rcx), %ymm6, %ymm12 + vpxor 32(%rax), %ymm7, %ymm13 + vpxor 96(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rdi) + vmovdqu %ymm1, (%rcx) + vmovdqu %ymm2, -32(%rcx) + vmovdqu %ymm3, 32(%rax) + vmovdqu %ymm4, 96(%rdi) + # Row 4 + vpxor -32(%rdi), %ymm7, %ymm10 + vpxor 64(%rcx), %ymm8, %ymm11 + vpxor 128(%rax), %ymm9, %ymm12 + vpxor -64(%rax), %ymm5, %ymm13 + vpxor -96(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rdi) + vmovdqu %ymm1, 64(%rcx) + vmovdqu %ymm2, 128(%rax) + vmovdqu %ymm3, -64(%rax) + vmovdqu %ymm4, -96(%rax) + # Round 13 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm10, %ymm10 + vpxor -64(%rdi), %ymm10, %ymm10 + vpxor (%rdi), %ymm10, %ymm10 + vpxor 32(%rdi), %ymm4, %ymm14 + vpxor 64(%rdi), %ymm14, %ymm14 + vpxor 96(%rdi), %ymm14, %ymm14 + vpxor 128(%rdi), %ymm14, %ymm14 + vpxor -32(%rax), %ymm3, %ymm13 + vpxor (%rax), %ymm13, %ymm13 + vpxor 32(%rax), %ymm13, %ymm13 + vpxor 64(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm2, 
%ymm12 + vpxor -96(%rcx), %ymm12, %ymm12 + vpxor -64(%rcx), %ymm12, %ymm12 + vpxor -32(%rcx), %ymm12, %ymm12 + vpxor (%rcx), %ymm1, %ymm11 + vpxor 32(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm11, %ymm11 + vpxor 128(%rcx), %ymm11, %ymm11 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 32(%rcx), %ymm6, %ymm11 + vpxor -96(%rcx), %ymm7, %ymm12 + vpxor 32(%rax), %ymm8, %ymm13 + vpxor -96(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 416(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 32(%rcx) + vmovdqu %ymm2, -96(%rcx) + vmovdqu %ymm3, 32(%rax) + vmovdqu %ymm4, -96(%rax) + # Row 1 + vpxor (%rax), %ymm8, %ymm10 + vpxor 128(%rdi), %ymm9, %ymm11 + vpxor (%rdi), %ymm5, %ymm12 + vpxor (%rcx), %ymm6, %ymm13 + vpxor 128(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rax) + vmovdqu %ymm1, 128(%rdi) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm4, 128(%rax) + # Row 2 + vpxor 128(%rcx), %ymm6, %ymm10 + vpxor 96(%rax), %ymm7, %ymm11 + vpxor -32(%rax), %ymm8, %ymm12 + vpxor 96(%rdi), %ymm9, %ymm13 + vpxor -32(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, 
%ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rcx) + vmovdqu %ymm1, 96(%rax) + vmovdqu %ymm2, -32(%rax) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, -32(%rdi) + # Row 3 + vpxor 64(%rdi), %ymm9, %ymm10 + vpxor -64(%rdi), %ymm5, %ymm11 + vpxor 96(%rcx), %ymm6, %ymm12 + vpxor -32(%rcx), %ymm7, %ymm13 + vpxor -64(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm1, -64(%rdi) + vmovdqu %ymm2, 96(%rcx) + vmovdqu %ymm3, -32(%rcx) + vmovdqu %ymm4, -64(%rax) + # Row 4 + vpxor -64(%rcx), %ymm7, %ymm10 + vpxor 64(%rax), %ymm8, %ymm11 + vpxor 32(%rdi), %ymm9, %ymm12 + vpxor -96(%rdi), %ymm5, %ymm13 + vpxor 64(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rcx) + vmovdqu %ymm1, 64(%rax) + vmovdqu %ymm2, 32(%rdi) + vmovdqu %ymm3, -96(%rdi) + vmovdqu %ymm4, 64(%rcx) + # Round 14 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -64(%rdi), %ymm1, %ymm11 + vpxor -32(%rdi), %ymm4, %ymm14 + vpxor (%rdi), %ymm2, %ymm12 + vpxor 64(%rdi), %ymm10, %ymm10 + vpxor 96(%rdi), %ymm3, %ymm13 + vpxor 128(%rdi), %ymm11, %ymm11 + vpxor -96(%rax), %ymm14, %ymm14 + vpxor -64(%rax), %ymm14, %ymm14 + vpxor -32(%rax), %ymm12, %ymm12 + vpxor (%rax), %ymm10, %ymm10 + vpxor 32(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm11, %ymm11 + vpxor 128(%rax), %ymm14, %ymm14 + vpxor -96(%rcx), %ymm12, %ymm12 + vpxor -32(%rcx), %ymm13, %ymm13 + vpxor (%rcx), %ymm13, %ymm13 + vpxor 32(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm10, %ymm10 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + 
vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 128(%rdi), %ymm6, %ymm11 + vpxor -32(%rax), %ymm7, %ymm12 + vpxor -32(%rcx), %ymm8, %ymm13 + vpxor 64(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 448(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 128(%rdi) + vmovdqu %ymm2, -32(%rax) + vmovdqu %ymm3, -32(%rcx) + vmovdqu %ymm4, 64(%rcx) + # Row 1 + vpxor 32(%rax), %ymm8, %ymm10 + vpxor 128(%rax), %ymm9, %ymm11 + vpxor 128(%rcx), %ymm5, %ymm12 + vpxor -64(%rdi), %ymm6, %ymm13 + vpxor 32(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rax) + vmovdqu %ymm1, 128(%rax) + vmovdqu %ymm2, 128(%rcx) + vmovdqu %ymm3, -64(%rdi) + vmovdqu %ymm4, 32(%rdi) + # Row 2 + vpxor 32(%rcx), %ymm6, %ymm10 + vpxor (%rdi), %ymm7, %ymm11 + vpxor 96(%rdi), %ymm8, %ymm12 + vpxor -64(%rax), %ymm9, %ymm13 + vpxor -64(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rcx) + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, -64(%rax) + vmovdqu %ymm4, -64(%rcx) + # Row 3 + vpxor -96(%rax), %ymm9, %ymm10 + vpxor (%rax), %ymm5, %ymm11 + vpxor 96(%rax), %ymm6, %ymm12 + vpxor 96(%rcx), %ymm7, %ymm13 + vpxor -96(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, 
%ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rax) + vmovdqu %ymm1, (%rax) + vmovdqu %ymm2, 96(%rax) + vmovdqu %ymm3, 96(%rcx) + vmovdqu %ymm4, -96(%rdi) + # Row 4 + vpxor -96(%rcx), %ymm7, %ymm10 + vpxor (%rcx), %ymm8, %ymm11 + vpxor -32(%rdi), %ymm9, %ymm12 + vpxor 64(%rdi), %ymm5, %ymm13 + vpxor 64(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rcx) + vmovdqu %ymm1, (%rcx) + vmovdqu %ymm2, -32(%rdi) + vmovdqu %ymm3, 64(%rdi) + vmovdqu %ymm4, 64(%rax) + # Round 15 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm4, %ymm14 + vpxor -64(%rdi), %ymm3, %ymm13 + vpxor (%rdi), %ymm1, %ymm11 + vpxor 32(%rdi), %ymm14, %ymm14 + vpxor 96(%rdi), %ymm2, %ymm12 + vpxor 128(%rdi), %ymm11, %ymm11 + vpxor -96(%rax), %ymm10, %ymm10 + vpxor -64(%rax), %ymm13, %ymm13 + vpxor -32(%rax), %ymm12, %ymm12 + vpxor (%rax), %ymm11, %ymm11 + vpxor 32(%rax), %ymm10, %ymm10 + vpxor 96(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm11, %ymm11 + vpxor -64(%rcx), %ymm14, %ymm14 + vpxor -32(%rcx), %ymm13, %ymm13 + vpxor 32(%rcx), %ymm10, %ymm10 + vpxor 64(%rcx), %ymm14, %ymm14 + vpxor 96(%rcx), %ymm13, %ymm13 + vpxor 128(%rcx), %ymm12, %ymm12 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 128(%rax), %ymm6, %ymm11 + vpxor 96(%rdi), %ymm7, %ymm12 + vpxor 96(%rcx), %ymm8, %ymm13 + vpxor 64(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn 
%ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 480(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 128(%rax) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 96(%rcx) + vmovdqu %ymm4, 64(%rax) + # Row 1 + vpxor -32(%rcx), %ymm8, %ymm10 + vpxor 32(%rdi), %ymm9, %ymm11 + vpxor 32(%rcx), %ymm5, %ymm12 + vpxor (%rax), %ymm6, %ymm13 + vpxor -32(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rcx) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 32(%rcx) + vmovdqu %ymm3, (%rax) + vmovdqu %ymm4, -32(%rdi) + # Row 2 + vpxor 128(%rdi), %ymm6, %ymm10 + vpxor 128(%rcx), %ymm7, %ymm11 + vpxor -64(%rax), %ymm8, %ymm12 + vpxor -96(%rdi), %ymm9, %ymm13 + vpxor -96(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm1, 128(%rcx) + vmovdqu %ymm2, -64(%rax) + vmovdqu %ymm3, -96(%rdi) + vmovdqu %ymm4, -96(%rcx) + # Row 3 + vpxor 64(%rcx), %ymm9, %ymm10 + vpxor 32(%rax), %ymm5, %ymm11 + vpxor (%rdi), %ymm6, %ymm12 + vpxor 96(%rax), %ymm7, %ymm13 + vpxor 64(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rcx) + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, 64(%rdi) + # Row 4 + vpxor -32(%rax), %ymm7, %ymm10 + vpxor -64(%rdi), %ymm8, 
%ymm11 + vpxor -64(%rcx), %ymm9, %ymm12 + vpxor -96(%rax), %ymm5, %ymm13 + vpxor (%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rax) + vmovdqu %ymm1, -64(%rdi) + vmovdqu %ymm2, -64(%rcx) + vmovdqu %ymm3, -96(%rax) + vmovdqu %ymm4, (%rcx) + # Round 16 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm3, %ymm13 + vpxor -32(%rdi), %ymm4, %ymm14 + vpxor (%rdi), %ymm2, %ymm12 + vpxor 32(%rdi), %ymm1, %ymm11 + vpxor 64(%rdi), %ymm14, %ymm14 + vpxor 96(%rdi), %ymm12, %ymm12 + vpxor 128(%rdi), %ymm10, %ymm10 + vpxor -64(%rax), %ymm12, %ymm12 + vpxor (%rax), %ymm13, %ymm13 + vpxor 32(%rax), %ymm11, %ymm11 + vpxor 64(%rax), %ymm14, %ymm14 + vpxor 96(%rax), %ymm13, %ymm13 + vpxor 128(%rax), %ymm11, %ymm11 + vpxor -96(%rcx), %ymm14, %ymm14 + vpxor -32(%rcx), %ymm10, %ymm10 + vpxor 32(%rcx), %ymm12, %ymm12 + vpxor 64(%rcx), %ymm10, %ymm10 + vpxor 96(%rcx), %ymm13, %ymm13 + vpxor 128(%rcx), %ymm11, %ymm11 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 32(%rdi), %ymm6, %ymm11 + vpxor -64(%rax), %ymm7, %ymm12 + vpxor 96(%rax), %ymm8, %ymm13 + vpxor (%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 512(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, -64(%rax) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, (%rcx) + # Row 1 + vpxor 96(%rcx), %ymm8, %ymm10 + vpxor -32(%rdi), %ymm9, %ymm11 + vpxor 128(%rdi), %ymm5, %ymm12 + vpxor 32(%rax), %ymm6, %ymm13 + vpxor -64(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, 
%ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rcx) + vmovdqu %ymm1, -32(%rdi) + vmovdqu %ymm2, 128(%rdi) + vmovdqu %ymm3, 32(%rax) + vmovdqu %ymm4, -64(%rcx) + # Row 2 + vpxor 128(%rax), %ymm6, %ymm10 + vpxor 32(%rcx), %ymm7, %ymm11 + vpxor -96(%rdi), %ymm8, %ymm12 + vpxor 64(%rdi), %ymm9, %ymm13 + vpxor -32(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rax) + vmovdqu %ymm1, 32(%rcx) + vmovdqu %ymm2, -96(%rdi) + vmovdqu %ymm3, 64(%rdi) + vmovdqu %ymm4, -32(%rax) + # Row 3 + vpxor 64(%rax), %ymm9, %ymm10 + vpxor -32(%rcx), %ymm5, %ymm11 + vpxor 128(%rcx), %ymm6, %ymm12 + vpxor (%rdi), %ymm7, %ymm13 + vpxor -96(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rax) + vmovdqu %ymm1, -32(%rcx) + vmovdqu %ymm2, 128(%rcx) + vmovdqu %ymm3, (%rdi) + vmovdqu %ymm4, -96(%rax) + # Row 4 + vpxor 96(%rdi), %ymm7, %ymm10 + vpxor (%rax), %ymm8, %ymm11 + vpxor -96(%rcx), %ymm9, %ymm12 + vpxor 64(%rcx), %ymm5, %ymm13 + vpxor -64(%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, 
%ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm1, (%rax) + vmovdqu %ymm2, -96(%rcx) + vmovdqu %ymm3, 64(%rcx) + vmovdqu %ymm4, -64(%rdi) + # Round 17 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm2, %ymm12 + vpxor -32(%rdi), %ymm1, %ymm11 + vpxor (%rdi), %ymm3, %ymm13 + vpxor 32(%rdi), %ymm11, %ymm11 + vpxor 64(%rdi), %ymm13, %ymm13 + vpxor 128(%rdi), %ymm12, %ymm12 + vpxor -96(%rax), %ymm4, %ymm14 + vpxor -64(%rax), %ymm12, %ymm12 + vpxor -32(%rax), %ymm14, %ymm14 + vpxor 32(%rax), %ymm13, %ymm13 + vpxor 64(%rax), %ymm10, %ymm10 + vpxor 96(%rax), %ymm13, %ymm13 + vpxor 128(%rax), %ymm10, %ymm10 + vpxor -64(%rcx), %ymm14, %ymm14 + vpxor -32(%rcx), %ymm11, %ymm11 + vpxor (%rcx), %ymm14, %ymm14 + vpxor 32(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm10, %ymm10 + vpxor 128(%rcx), %ymm12, %ymm12 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -32(%rdi), %ymm6, %ymm11 + vpxor -96(%rdi), %ymm7, %ymm12 + vpxor (%rdi), %ymm8, %ymm13 + vpxor -64(%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 544(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -32(%rdi) + vmovdqu %ymm2, -96(%rdi) + vmovdqu %ymm3, (%rdi) + vmovdqu %ymm4, -64(%rdi) + # Row 1 + vpxor 96(%rax), %ymm8, %ymm10 + vpxor -64(%rcx), %ymm9, %ymm11 + vpxor 128(%rax), %ymm5, %ymm12 + vpxor -32(%rcx), %ymm6, %ymm13 + vpxor -96(%rcx), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rax) + vmovdqu %ymm1, -64(%rcx) + vmovdqu %ymm2, 128(%rax) + vmovdqu %ymm3, -32(%rcx) + vmovdqu %ymm4, 
-96(%rcx) + # Row 2 + vpxor 32(%rdi), %ymm6, %ymm10 + vpxor 128(%rdi), %ymm7, %ymm11 + vpxor 64(%rdi), %ymm8, %ymm12 + vpxor -96(%rax), %ymm9, %ymm13 + vpxor 96(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm1, 128(%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, -96(%rax) + vmovdqu %ymm4, 96(%rdi) + # Row 3 + vpxor (%rcx), %ymm9, %ymm10 + vpxor 96(%rcx), %ymm5, %ymm11 + vpxor 32(%rcx), %ymm6, %ymm12 + vpxor 128(%rcx), %ymm7, %ymm13 + vpxor 64(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rcx) + vmovdqu %ymm1, 96(%rcx) + vmovdqu %ymm2, 32(%rcx) + vmovdqu %ymm3, 128(%rcx) + vmovdqu %ymm4, 64(%rcx) + # Row 4 + vpxor -64(%rax), %ymm7, %ymm10 + vpxor 32(%rax), %ymm8, %ymm11 + vpxor -32(%rax), %ymm9, %ymm12 + vpxor 64(%rax), %ymm5, %ymm13 + vpxor (%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rax) + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, -32(%rax) + vmovdqu %ymm3, 64(%rax) + vmovdqu %ymm4, (%rax) + # Round 18 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm2, %ymm12 + vpxor -64(%rdi), %ymm4, %ymm14 + vpxor -32(%rdi), %ymm1, %ymm11 + vpxor (%rdi), %ymm3, %ymm13 + vpxor 32(%rdi), %ymm10, %ymm10 + vpxor 64(%rdi), %ymm12, %ymm12 + vpxor 96(%rdi), %ymm14, %ymm14 + vpxor 128(%rdi), %ymm11, %ymm11 + vpxor -96(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm10, %ymm10 + vpxor 128(%rax), %ymm12, %ymm12 + vpxor -96(%rcx), %ymm14, 
%ymm14 + vpxor -64(%rcx), %ymm11, %ymm11 + vpxor -32(%rcx), %ymm13, %ymm13 + vpxor (%rcx), %ymm10, %ymm10 + vpxor 32(%rcx), %ymm12, %ymm12 + vpxor 64(%rcx), %ymm14, %ymm14 + vpxor 96(%rcx), %ymm11, %ymm11 + vpxor 128(%rcx), %ymm13, %ymm13 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -64(%rcx), %ymm6, %ymm11 + vpxor 64(%rdi), %ymm7, %ymm12 + vpxor 128(%rcx), %ymm8, %ymm13 + vpxor (%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 576(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -64(%rcx) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 128(%rcx) + vmovdqu %ymm4, (%rax) + # Row 1 + vpxor (%rdi), %ymm8, %ymm10 + vpxor -96(%rcx), %ymm9, %ymm11 + vpxor 32(%rdi), %ymm5, %ymm12 + vpxor 96(%rcx), %ymm6, %ymm13 + vpxor -32(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -96(%rcx) + vmovdqu %ymm2, 32(%rdi) + vmovdqu %ymm3, 96(%rcx) + vmovdqu %ymm4, -32(%rax) + # Row 2 + vpxor -32(%rdi), %ymm6, %ymm10 + vpxor 128(%rax), %ymm7, %ymm11 + vpxor -96(%rax), %ymm8, %ymm12 + vpxor 64(%rcx), %ymm9, %ymm13 + vpxor -64(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn 
%ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rdi) + vmovdqu %ymm1, 128(%rax) + vmovdqu %ymm2, -96(%rax) + vmovdqu %ymm3, 64(%rcx) + vmovdqu %ymm4, -64(%rax) + # Row 3 + vpxor -64(%rdi), %ymm9, %ymm10 + vpxor 96(%rax), %ymm5, %ymm11 + vpxor 128(%rdi), %ymm6, %ymm12 + vpxor 32(%rcx), %ymm7, %ymm13 + vpxor 64(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rdi) + vmovdqu %ymm1, 96(%rax) + vmovdqu %ymm2, 128(%rdi) + vmovdqu %ymm3, 32(%rcx) + vmovdqu %ymm4, 64(%rax) + # Row 4 + vpxor -96(%rdi), %ymm7, %ymm10 + vpxor -32(%rcx), %ymm8, %ymm11 + vpxor 96(%rdi), %ymm9, %ymm12 + vpxor (%rcx), %ymm5, %ymm13 + vpxor 32(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rdi) + vmovdqu %ymm1, -32(%rcx) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm4, 32(%rax) + # Round 19 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -64(%rdi), %ymm10, %ymm10 + vpxor -32(%rdi), %ymm10, %ymm10 + vpxor (%rdi), %ymm10, %ymm10 + vpxor 32(%rdi), %ymm2, %ymm12 + vpxor 64(%rdi), %ymm12, %ymm12 + vpxor 128(%rdi), %ymm12, %ymm12 + vpxor -96(%rax), %ymm12, %ymm12 + vpxor -64(%rax), %ymm4, %ymm14 + vpxor -32(%rax), %ymm14, %ymm14 + vpxor (%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm14, %ymm14 + vpxor 96(%rax), %ymm1, %ymm11 + vpxor 128(%rax), %ymm11, %ymm11 + vpxor -96(%rcx), %ymm11, %ymm11 + vpxor -64(%rcx), %ymm11, %ymm11 + vpxor 32(%rcx), %ymm3, %ymm13 + vpxor 64(%rcx), %ymm13, %ymm13 + vpxor 96(%rcx), %ymm13, %ymm13 + vpxor 128(%rcx), %ymm13, %ymm13 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, 
%ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -96(%rcx), %ymm6, %ymm11 + vpxor -96(%rax), %ymm7, %ymm12 + vpxor 32(%rcx), %ymm8, %ymm13 + vpxor 32(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 608(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -96(%rcx) + vmovdqu %ymm2, -96(%rax) + vmovdqu %ymm3, 32(%rcx) + vmovdqu %ymm4, 32(%rax) + # Row 1 + vpxor 128(%rcx), %ymm8, %ymm10 + vpxor -32(%rax), %ymm9, %ymm11 + vpxor -32(%rdi), %ymm5, %ymm12 + vpxor 96(%rax), %ymm6, %ymm13 + vpxor 96(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rcx) + vmovdqu %ymm1, -32(%rax) + vmovdqu %ymm2, -32(%rdi) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm4, 96(%rdi) + # Row 2 + vpxor -64(%rcx), %ymm6, %ymm10 + vpxor 32(%rdi), %ymm7, %ymm11 + vpxor 64(%rcx), %ymm8, %ymm12 + vpxor 64(%rax), %ymm9, %ymm13 + vpxor -96(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rcx) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 64(%rcx) + vmovdqu %ymm3, 64(%rax) + vmovdqu %ymm4, -96(%rdi) + # Row 3 + vpxor (%rax), %ymm9, %ymm10 + vpxor (%rdi), %ymm5, %ymm11 + vpxor 128(%rax), %ymm6, %ymm12 + vpxor 128(%rdi), %ymm7, %ymm13 + vpxor (%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq 
$10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rax) + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm2, 128(%rax) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, (%rcx) + # Row 4 + vpxor 64(%rdi), %ymm7, %ymm10 + vpxor 96(%rcx), %ymm8, %ymm11 + vpxor -64(%rax), %ymm9, %ymm12 + vpxor -64(%rdi), %ymm5, %ymm13 + vpxor -32(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm1, 96(%rcx) + vmovdqu %ymm2, -64(%rax) + vmovdqu %ymm3, -64(%rdi) + vmovdqu %ymm4, -32(%rcx) + # Round 20 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm4, %ymm14 + vpxor -32(%rdi), %ymm2, %ymm12 + vpxor (%rdi), %ymm1, %ymm11 + vpxor 32(%rdi), %ymm11, %ymm11 + vpxor 96(%rdi), %ymm14, %ymm14 + vpxor 128(%rdi), %ymm3, %ymm13 + vpxor -96(%rax), %ymm12, %ymm12 + vpxor -32(%rax), %ymm11, %ymm11 + vpxor (%rax), %ymm10, %ymm10 + vpxor 32(%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm13, %ymm13 + vpxor 96(%rax), %ymm13, %ymm13 + vpxor 128(%rax), %ymm12, %ymm12 + vpxor -96(%rcx), %ymm11, %ymm11 + vpxor -64(%rcx), %ymm10, %ymm10 + vpxor (%rcx), %ymm14, %ymm14 + vpxor 32(%rcx), %ymm13, %ymm13 + vpxor 64(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm10, %ymm10 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -32(%rax), %ymm6, %ymm11 + vpxor 64(%rcx), %ymm7, %ymm12 + vpxor 128(%rdi), %ymm8, %ymm13 + vpxor -32(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + 
vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 640(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -32(%rax) + vmovdqu %ymm2, 64(%rcx) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, -32(%rcx) + # Row 1 + vpxor 32(%rcx), %ymm8, %ymm10 + vpxor 96(%rdi), %ymm9, %ymm11 + vpxor -64(%rcx), %ymm5, %ymm12 + vpxor (%rdi), %ymm6, %ymm13 + vpxor -64(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rcx) + vmovdqu %ymm1, 96(%rdi) + vmovdqu %ymm2, -64(%rcx) + vmovdqu %ymm3, (%rdi) + vmovdqu %ymm4, -64(%rax) + # Row 2 + vpxor -96(%rcx), %ymm6, %ymm10 + vpxor -32(%rdi), %ymm7, %ymm11 + vpxor 64(%rax), %ymm8, %ymm12 + vpxor (%rcx), %ymm9, %ymm13 + vpxor 64(%rdi), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rcx) + vmovdqu %ymm1, -32(%rdi) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, (%rcx) + vmovdqu %ymm4, 64(%rdi) + # Row 3 + vpxor 32(%rax), %ymm9, %ymm10 + vpxor 128(%rcx), %ymm5, %ymm11 + vpxor 32(%rdi), %ymm6, %ymm12 + vpxor 128(%rax), %ymm7, %ymm13 + vpxor -64(%rdi), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rax) + vmovdqu %ymm1, 128(%rcx) + vmovdqu %ymm2, 32(%rdi) + vmovdqu %ymm3, 128(%rax) + vmovdqu %ymm4, -64(%rdi) + # Row 4 + vpxor -96(%rax), %ymm7, %ymm10 + vpxor 96(%rax), 
%ymm8, %ymm11 + vpxor -96(%rdi), %ymm9, %ymm12 + vpxor (%rax), %ymm5, %ymm13 + vpxor 96(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -96(%rax) + vmovdqu %ymm1, 96(%rax) + vmovdqu %ymm2, -96(%rdi) + vmovdqu %ymm3, (%rax) + vmovdqu %ymm4, 96(%rcx) + # Round 21 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -64(%rdi), %ymm4, %ymm14 + vpxor -32(%rdi), %ymm1, %ymm11 + vpxor (%rdi), %ymm3, %ymm13 + vpxor 32(%rdi), %ymm2, %ymm12 + vpxor 64(%rdi), %ymm14, %ymm14 + vpxor 96(%rdi), %ymm11, %ymm11 + vpxor 128(%rdi), %ymm13, %ymm13 + vpxor -64(%rax), %ymm14, %ymm14 + vpxor -32(%rax), %ymm11, %ymm11 + vpxor 32(%rax), %ymm10, %ymm10 + vpxor 64(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm13, %ymm13 + vpxor -96(%rcx), %ymm10, %ymm10 + vpxor -64(%rcx), %ymm12, %ymm12 + vpxor -32(%rcx), %ymm14, %ymm14 + vpxor (%rcx), %ymm13, %ymm13 + vpxor 32(%rcx), %ymm10, %ymm10 + vpxor 64(%rcx), %ymm12, %ymm12 + vpxor 128(%rcx), %ymm11, %ymm11 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor 96(%rdi), %ymm6, %ymm11 + vpxor 64(%rax), %ymm7, %ymm12 + vpxor 128(%rax), %ymm8, %ymm13 + vpxor 96(%rcx), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 672(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, 96(%rdi) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 128(%rax) + vmovdqu %ymm4, 96(%rcx) + # Row 1 + vpxor 128(%rdi), %ymm8, %ymm10 + vpxor -64(%rax), %ymm9, %ymm11 + vpxor -96(%rcx), %ymm5, %ymm12 + vpxor 128(%rcx), %ymm6, %ymm13 + vpxor -96(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + 
vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rdi) + vmovdqu %ymm1, -64(%rax) + vmovdqu %ymm2, -96(%rcx) + vmovdqu %ymm3, 128(%rcx) + vmovdqu %ymm4, -96(%rdi) + # Row 2 + vpxor -32(%rax), %ymm6, %ymm10 + vpxor -64(%rcx), %ymm7, %ymm11 + vpxor (%rcx), %ymm8, %ymm12 + vpxor -64(%rdi), %ymm9, %ymm13 + vpxor -96(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rax) + vmovdqu %ymm1, -64(%rcx) + vmovdqu %ymm2, (%rcx) + vmovdqu %ymm3, -64(%rdi) + vmovdqu %ymm4, -96(%rax) + # Row 3 + vpxor -32(%rcx), %ymm9, %ymm10 + vpxor 32(%rcx), %ymm5, %ymm11 + vpxor -32(%rdi), %ymm6, %ymm12 + vpxor 32(%rdi), %ymm7, %ymm13 + vpxor (%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -32(%rcx) + vmovdqu %ymm1, 32(%rcx) + vmovdqu %ymm2, -32(%rdi) + vmovdqu %ymm3, 32(%rdi) + vmovdqu %ymm4, (%rax) + # Row 4 + vpxor 64(%rcx), %ymm7, %ymm10 + vpxor (%rdi), %ymm8, %ymm11 + vpxor 64(%rdi), %ymm9, %ymm12 + vpxor 32(%rax), %ymm5, %ymm13 + vpxor 96(%rax), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, 
%ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rcx) + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 32(%rax) + vmovdqu %ymm4, 96(%rax) + # Round 22 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm4, %ymm14 + vpxor -64(%rdi), %ymm3, %ymm13 + vpxor -32(%rdi), %ymm2, %ymm12 + vpxor 32(%rdi), %ymm13, %ymm13 + vpxor 96(%rdi), %ymm1, %ymm11 + vpxor 128(%rdi), %ymm10, %ymm10 + vpxor -96(%rax), %ymm14, %ymm14 + vpxor -64(%rax), %ymm11, %ymm11 + vpxor -32(%rax), %ymm10, %ymm10 + vpxor (%rax), %ymm14, %ymm14 + vpxor 64(%rax), %ymm12, %ymm12 + vpxor 128(%rax), %ymm13, %ymm13 + vpxor -96(%rcx), %ymm12, %ymm12 + vpxor -64(%rcx), %ymm11, %ymm11 + vpxor -32(%rcx), %ymm10, %ymm10 + vpxor (%rcx), %ymm12, %ymm12 + vpxor 32(%rcx), %ymm11, %ymm11 + vpxor 96(%rcx), %ymm14, %ymm14 + vpxor 128(%rcx), %ymm13, %ymm13 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -64(%rax), %ymm6, %ymm11 + vpxor (%rcx), %ymm7, %ymm12 + vpxor 32(%rdi), %ymm8, %ymm13 + vpxor 96(%rax), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 704(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -64(%rax) + vmovdqu %ymm2, (%rcx) + vmovdqu %ymm3, 32(%rdi) + vmovdqu %ymm4, 96(%rax) + # Row 1 + vpxor 128(%rax), %ymm8, %ymm10 + vpxor -96(%rdi), %ymm9, %ymm11 + vpxor -32(%rax), %ymm5, %ymm12 + vpxor 32(%rcx), %ymm6, %ymm13 + vpxor 64(%rdi), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 128(%rax) + vmovdqu %ymm1, -96(%rdi) + vmovdqu %ymm2, -32(%rax) + vmovdqu %ymm3, 32(%rcx) + vmovdqu %ymm4, 
64(%rdi) + # Row 2 + vpxor 96(%rdi), %ymm6, %ymm10 + vpxor -96(%rcx), %ymm7, %ymm11 + vpxor -64(%rdi), %ymm8, %ymm12 + vpxor (%rax), %ymm9, %ymm13 + vpxor 64(%rcx), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm1, -96(%rcx) + vmovdqu %ymm2, -64(%rdi) + vmovdqu %ymm3, (%rax) + vmovdqu %ymm4, 64(%rcx) + # Row 3 + vpxor 96(%rcx), %ymm9, %ymm10 + vpxor 128(%rdi), %ymm5, %ymm11 + vpxor -64(%rcx), %ymm6, %ymm12 + vpxor -32(%rdi), %ymm7, %ymm13 + vpxor 32(%rax), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rcx) + vmovdqu %ymm1, 128(%rdi) + vmovdqu %ymm2, -64(%rcx) + vmovdqu %ymm3, -32(%rdi) + vmovdqu %ymm4, 32(%rax) + # Row 4 + vpxor 64(%rax), %ymm7, %ymm10 + vpxor 128(%rcx), %ymm8, %ymm11 + vpxor -96(%rax), %ymm9, %ymm12 + vpxor -32(%rcx), %ymm5, %ymm13 + vpxor (%rdi), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 64(%rax) + vmovdqu %ymm1, 128(%rcx) + vmovdqu %ymm2, -96(%rax) + vmovdqu %ymm3, -32(%rcx) + vmovdqu %ymm4, (%rdi) + # Round 23 + # Calc b[0..4] + vpxor %ymm15, %ymm0, %ymm10 + vpxor -96(%rdi), %ymm1, %ymm11 + vpxor -64(%rdi), %ymm2, %ymm12 + vpxor -32(%rdi), %ymm3, %ymm13 + vpxor 32(%rdi), %ymm13, %ymm13 + vpxor 64(%rdi), %ymm4, %ymm14 + vpxor 96(%rdi), %ymm10, %ymm10 + vpxor 128(%rdi), %ymm11, %ymm11 + vpxor -64(%rax), %ymm11, %ymm11 + vpxor -32(%rax), %ymm12, %ymm12 + vpxor (%rax), %ymm13, %ymm13 + vpxor 32(%rax), %ymm14, %ymm14 + vpxor 96(%rax), %ymm14, 
%ymm14 + vpxor 128(%rax), %ymm10, %ymm10 + vpxor -96(%rcx), %ymm11, %ymm11 + vpxor -64(%rcx), %ymm12, %ymm12 + vpxor (%rcx), %ymm12, %ymm12 + vpxor 32(%rcx), %ymm13, %ymm13 + vpxor 64(%rcx), %ymm14, %ymm14 + vpxor 96(%rcx), %ymm10, %ymm10 + # Calc t[0..4] + vpsrlq $63, %ymm11, %ymm0 + vpsrlq $63, %ymm12, %ymm1 + vpsrlq $63, %ymm13, %ymm2 + vpsrlq $63, %ymm14, %ymm3 + vpsrlq $63, %ymm10, %ymm4 + vpaddq %ymm11, %ymm11, %ymm5 + vpaddq %ymm12, %ymm12, %ymm6 + vpaddq %ymm13, %ymm13, %ymm7 + vpaddq %ymm14, %ymm14, %ymm8 + vpaddq %ymm10, %ymm10, %ymm9 + vpor %ymm0, %ymm5, %ymm5 + vpor %ymm1, %ymm6, %ymm6 + vpor %ymm2, %ymm7, %ymm7 + vpor %ymm3, %ymm8, %ymm8 + vpor %ymm4, %ymm9, %ymm9 + vpxor %ymm14, %ymm5, %ymm5 + vpxor %ymm10, %ymm6, %ymm6 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm8, %ymm8 + vpxor %ymm13, %ymm9, %ymm9 + # Row Mix + # Row 0 + vpxor %ymm15, %ymm5, %ymm10 + vpxor -96(%rdi), %ymm6, %ymm11 + vpxor -64(%rdi), %ymm7, %ymm12 + vpxor -32(%rdi), %ymm8, %ymm13 + vpxor (%rdi), %ymm9, %ymm14 + vpsrlq $20, %ymm11, %ymm0 + vpsrlq $21, %ymm12, %ymm1 + vpsrlq $43, %ymm13, %ymm2 + vpsrlq $50, %ymm14, %ymm3 + vpsllq $44, %ymm11, %ymm11 + vpsllq $43, %ymm12, %ymm12 + vpsllq $21, %ymm13, %ymm13 + vpsllq $14, %ymm14, %ymm14 + vpor %ymm0, %ymm11, %ymm11 + vpor %ymm1, %ymm12, %ymm12 + vpor %ymm2, %ymm13, %ymm13 + vpor %ymm3, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + # XOR in constant + vpxor 736(%rdx), %ymm15, %ymm15 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm1, -96(%rdi) + vmovdqu %ymm2, -64(%rdi) + vmovdqu %ymm3, -32(%rdi) + vmovdqu %ymm4, (%rdi) + # Row 1 + vpxor 32(%rdi), %ymm8, %ymm10 + vpxor 64(%rdi), %ymm9, %ymm11 + vpxor 96(%rdi), %ymm5, %ymm12 + vpxor 128(%rdi), %ymm6, %ymm13 + vpxor -96(%rax), %ymm7, %ymm14 + vpsrlq $36, %ymm10, %ymm0 + vpsrlq $44, %ymm11, %ymm1 + vpsrlq $61, %ymm12, %ymm2 + vpsrlq $19, %ymm13, %ymm3 + vpsrlq $3, %ymm14, %ymm4 + vpsllq $28, %ymm10, %ymm10 + vpsllq $20, %ymm11, %ymm11 + vpsllq $3, %ymm12, %ymm12 + vpsllq $45, %ymm13, %ymm13 + vpsllq $61, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 32(%rdi) + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, -96(%rax) + # Row 2 + vpxor -64(%rax), %ymm6, %ymm10 + vpxor -32(%rax), %ymm7, %ymm11 + vpxor (%rax), %ymm8, %ymm12 + vpxor 32(%rax), %ymm9, %ymm13 + vpxor 64(%rax), %ymm5, %ymm14 + vpsrlq $63, %ymm10, %ymm0 + vpsrlq $58, %ymm11, %ymm1 + vpsrlq $39, %ymm12, %ymm2 + vpsrlq $56, %ymm13, %ymm3 + vpsrlq $46, %ymm14, %ymm4 + vpaddq %ymm10, %ymm10, %ymm10 + vpsllq $6, %ymm11, %ymm11 + vpsllq $25, %ymm12, %ymm12 + vpsllq $8, %ymm13, %ymm13 + vpsllq $18, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn 
%ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, -64(%rax) + vmovdqu %ymm1, -32(%rax) + vmovdqu %ymm2, (%rax) + vmovdqu %ymm3, 32(%rax) + vmovdqu %ymm4, 64(%rax) + # Row 3 + vpxor 96(%rax), %ymm9, %ymm10 + vpxor 128(%rax), %ymm5, %ymm11 + vpxor -96(%rcx), %ymm6, %ymm12 + vpxor -64(%rcx), %ymm7, %ymm13 + vpxor -32(%rcx), %ymm8, %ymm14 + vpsrlq $37, %ymm10, %ymm0 + vpsrlq $28, %ymm11, %ymm1 + vpsrlq $54, %ymm12, %ymm2 + vpsrlq $49, %ymm13, %ymm3 + vpsrlq $8, %ymm14, %ymm4 + vpsllq $27, %ymm10, %ymm10 + vpsllq $36, %ymm11, %ymm11 + vpsllq $10, %ymm12, %ymm12 + vpsllq $15, %ymm13, %ymm13 + vpsllq $56, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, 96(%rax) + vmovdqu %ymm1, 128(%rax) + vmovdqu %ymm2, -96(%rcx) + vmovdqu %ymm3, -64(%rcx) + vmovdqu %ymm4, -32(%rcx) + # Row 4 + vpxor (%rcx), %ymm7, %ymm10 + vpxor 32(%rcx), %ymm8, %ymm11 + vpxor 64(%rcx), %ymm9, %ymm12 + vpxor 96(%rcx), %ymm5, %ymm13 + vpxor 128(%rcx), %ymm6, %ymm14 + vpsrlq $2, %ymm10, %ymm0 + vpsrlq $9, %ymm11, %ymm1 + vpsrlq $25, %ymm12, %ymm2 + vpsrlq $23, %ymm13, %ymm3 + vpsrlq $62, %ymm14, %ymm4 + vpsllq $62, %ymm10, %ymm10 + vpsllq $55, %ymm11, %ymm11 + vpsllq $39, %ymm12, %ymm12 + vpsllq $41, %ymm13, %ymm13 + vpsllq $2, %ymm14, %ymm14 + vpor %ymm0, %ymm10, %ymm10 + vpor %ymm1, %ymm11, %ymm11 + vpor %ymm2, %ymm12, %ymm12 + vpor %ymm3, %ymm13, %ymm13 + vpor %ymm4, %ymm14, %ymm14 + vpandn %ymm12, %ymm11, %ymm0 + vpandn %ymm13, %ymm12, %ymm1 + vpandn %ymm14, %ymm13, %ymm2 + vpandn %ymm10, %ymm14, %ymm3 + vpandn %ymm11, %ymm10, %ymm4 + vpxor %ymm10, %ymm0, %ymm0 + vpxor %ymm11, %ymm1, %ymm1 + vpxor %ymm12, %ymm2, %ymm2 + vpxor %ymm13, %ymm3, %ymm3 + vpxor %ymm14, %ymm4, %ymm4 + vmovdqu %ymm0, (%rcx) + vmovdqu %ymm1, 32(%rcx) + vmovdqu %ymm2, 64(%rcx) + vmovdqu %ymm3, 96(%rcx) + vmovdqu %ymm4, 128(%rcx) + subq $0x80, %rdi + vmovdqu %ymm15, (%rdi) + vzeroupper + repz retq +#ifndef __APPLE__ +.size kyber_sha3_blocksx4_avx2,.-kyber_sha3_blocksx4_avx2 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif
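Reader's note (not part of the patch): the AVX2 code above is the tail, rounds 17 to 23, of a Keccak-f[1600] permutation computed for four states at once, one state per 64-bit lane of each ymm register, as the kyber_sha3_blocksx4_avx2 symbol suggests. The C below is a minimal scalar sketch that mirrors the step names used in the assembly comments: "Calc b[0..4]" and "Calc t[0..4]" are the two halves of theta, "Row Mix" folds rho, pi and chi together row by row, and "XOR in constant" is iota. KeccakRound, KECCAK_RC, RHO and ROTL64 are illustrative names only, not symbols from this file or from wolfSSL.

#include <stdint.h>

/* Rotate left; the (64 - n) & 63 keeps a zero offset well defined. */
#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((64 - (n)) & 63)))

/* Standard Keccak round constants for iota; only rounds 17..23 appear in
 * the assembly excerpt above. */
static const uint64_t KECCAK_RC[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
    0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

/* Rho rotation offsets, indexed as s[x + 5*y]; the paired vpsrlq/vpsllq
 * amounts in the assembly always sum to 64 and match these values. */
static const int RHO[25] = {
     0,  1, 62, 28, 27,
    36, 44,  6, 55, 20,
     3, 10, 43, 25, 39,
    41, 45, 15, 21,  8,
    18,  2, 61, 56, 14
};

/* One Keccak-f[1600] round on a single state, lanes stored as s[x + 5*y]. */
static void KeccakRound(uint64_t s[25], int round)
{
    uint64_t b[5], t[5], tmp[25];
    int x, y;

    /* Theta, step 1 ("Calc b[0..4]"): XOR of the five lanes in each column. */
    for (x = 0; x < 5; x++)
        b[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];

    /* Theta, step 2 ("Calc t[0..4]"): t[x] = b[x-1] ^ ROTL64(b[x+1], 1). */
    for (x = 0; x < 5; x++)
        t[x] = b[(x + 4) % 5] ^ ROTL64(b[(x + 1) % 5], 1);

    /* Theta step 3, rho and pi ("Row Mix" builds each output row this way). */
    for (y = 0; y < 5; y++) {
        for (x = 0; x < 5; x++) {
            int sx = (x + 3 * y) % 5;   /* pi source column */
            int sy = x;                 /* pi source row    */
            tmp[x + 5 * y] = ROTL64(s[sx + 5 * sy] ^ t[sx], RHO[sx + 5 * sy]);
        }
    }

    /* Chi (the vpandn/vpxor pairs) and iota ("XOR in constant"). */
    for (y = 0; y < 5; y++)
        for (x = 0; x < 5; x++)
            s[x + 5 * y] = tmp[x + 5 * y] ^
                (~tmp[(x + 1) % 5 + 5 * y] & tmp[(x + 2) % 5 + 5 * y]);
    s[0] ^= KECCAK_RC[round];
}

The full permutation applies KeccakRound for round = 0..23. In the vectorized routine the round constant is loaded from the table reached through %rdx at offsets 544, 576, ..., 736 for rounds 17 to 23, consistent with 32-byte entries holding each constant broadcast across the four lanes, and it is XORed only into the register holding the (0,0) lanes, matching s[0] ^= KECCAK_RC[round] above. Running four independent states in the four lanes lets one instruction stream drive four SHA-3/SHAKE computations, which is presumably why the Kyber code batches its hashing in groups of four.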