diff --git a/configure.ac b/configure.ac index a0f81f755..1426a8df8 100644 --- a/configure.ac +++ b/configure.ac @@ -1478,14 +1478,39 @@ then fi -# AES-ARM +# ARM Assembly AC_ARG_ENABLE([armasm], - [AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled)])], + [AS_HELP_STRING([--enable-armasm],[Enable wolfSSL ARMv8 ASM support (default: disabled). Set to sha512-crypto to use the SHA-512 instructions on an AArch64 CPU.])], [ ENABLED_ARMASM=$enableval ], [ ENABLED_ARMASM=no ] ) +ENABLED_ARMASM_SHA512=no -if test "$ENABLED_ARMASM" = "yes" && test "$ENABLED_ASM" = "yes" +if test "$ENABLED_ARMASM" != "no" && test "$ENABLED_ASM" = "yes" then + for v in `echo $ENABLED_ARMASM | tr "," " "` + do + case $v in + yes) + ;; + sha512-crypto) + case $host_cpu in + *aarch64*) + ;; + *) + AC_MSG_ERROR([SHA512 instructions are only available on AArch64 CPUs.]) + break;; + esac + ENABLED_ARMASM_SHA512=yes + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512" + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_CRYPTO_SHA512" + ;; + *) + AC_MSG_ERROR([Invalid choice of ARM asm inclusions (yes, sha512-crypto): $ENABLED_ARMASM.]) + break;; + esac + done + ENABLED_ARMASM="yes" + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM" AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM -DWOLFSSL_NO_HASH_RAW" #Check if mcpu and mfpu values already set if not use default @@ -1531,7 +1556,6 @@ then esac fi - # Xilinx hardened crypto AC_ARG_ENABLE([xilinx], [AS_HELP_STRING([--enable-xilinx],[Enable wolfSSL support for Xilinx hardened crypto(default: disabled)])], @@ -7233,6 +7257,7 @@ echo " * Cavium Nitrox: $ENABLED_CAVIUM" echo " * Cavium Octeon (Sync): $ENABLED_OCTEON_SYNC" echo " * Intel Quick Assist: $ENABLED_INTEL_QA" echo " * ARM ASM: $ENABLED_ARMASM" +echo " * ARM ASM SHA512 Crypto: $ENABLED_ARMASM_SHA512" echo " * AES Key Wrap: $ENABLED_AESKEYWRAP" echo " * Write duplicate: $ENABLED_WRITEDUP" echo " * Xilinx Hardware Acc.: $ENABLED_XILINX" diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 1214ba429..0cf4d8f95 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -28,9 +28,7 @@ #include -#if (defined(WOLFSSL_X86_64_BUILD) || defined(USE_INTEL_SPEEDUP) || \ - defined(WOLFSSL_AESNI) || defined(WOLFSSL_SP_X86_64_ASM)) && \ - !defined(WOLFSSL_NO_ASM) +#ifdef HAVE_CPUID_INTEL /* Each platform needs to query info type 1 from cpuid to see if aesni is * supported.
Also, let's setup a macro for proper linkage w/o ABI conflicts */ @@ -102,6 +100,9 @@ cpuid_check = 1; } } +#endif + +#ifdef HAVE_CPUID word32 cpuid_get_flags(void) { @@ -124,4 +125,5 @@ { cpuid_flags &= ~flag; } -#endif + +#endif /* HAVE_CPUID */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index d07210a68..218658a0a 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -28,8 +28,9 @@ * cd ../scripts * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S */ -#if defined(WOLFSSL_ARMASM) && defined(HAVE_CURVE25519) +#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifdef HAVE_CURVE25519 #ifndef __APPLE__ .text .globl fe_init @@ -7471,8 +7472,9 @@ _fe_ge_sub: #ifndef __APPLE__ .size fe_ge_sub,.-fe_ge_sub #endif /* __APPLE__ */ +#endif /* HAVE_CURVE25519 */ #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM && HAVE_CURVE25519 */ +#endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.c b/wolfcrypt/src/port/arm/armv8-curve25519.c index 81352c705..0401813e5 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519.c @@ -28,8 +28,9 @@ * cd ../scripts * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c */ -#if defined(WOLFSSL_ARMASM) && defined(HAVE_CURVE25519) +#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifdef HAVE_CURVE25519 #include void fe_init() @@ -7216,5 +7217,6 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz ); } +#endif /* HAVE_CURVE25519 */ #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM && HAVE_CURVE25519 */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 9b28a8231..136df9472 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -28,8 +28,10 @@ * cd ../scripts * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.S */ -#if defined(WOLFSSL_ARMASM) && defined(WOLFSSL_SHA512) +#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifdef WOLFSSL_SHA512 +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512 #ifndef __APPLE__ .text .type L_SHA512_transform_neon_len_k, %object @@ -141,15 +143,15 @@ L_SHA512_transform_neon_len_ror8: .xword 0x7060504030201, 0x80f0e0d0c0b0a09 #ifndef __APPLE__ .text -.globl Transform_Sha512_Len -.type Transform_Sha512_Len,@function +.globl Transform_Sha512_Len_neon +.type Transform_Sha512_Len_neon,@function .align 2 -Transform_Sha512_Len: +Transform_Sha512_Len_neon: #else .section __TEXT,__text -.globl _Transform_Sha512_Len +.globl _Transform_Sha512_Len_neon .p2align 2 -_Transform_Sha512_Len: +_Transform_Sha512_Len_neon: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! 
add x29, sp, #0 @@ -1090,10 +1092,662 @@ L_sha512_len_neon_start: ldp x29, x30, [sp], #0x80 ret #ifndef __APPLE__ - .size Transform_Sha512_Len,.-Transform_Sha512_Len + .size Transform_Sha512_Len_neon,.-Transform_Sha512_Len_neon #endif /* __APPLE__ */ +#else +#ifndef __APPLE__ + .text + .type L_SHA512_transform_crypto_len_k, %object + .section .rodata + .size L_SHA512_transform_crypto_len_k, 640 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_SHA512_transform_crypto_len_k: + .xword 0x428a2f98d728ae22 + .xword 0x7137449123ef65cd + .xword 0xb5c0fbcfec4d3b2f + .xword 0xe9b5dba58189dbbc + .xword 0x3956c25bf348b538 + .xword 0x59f111f1b605d019 + .xword 0x923f82a4af194f9b + .xword 0xab1c5ed5da6d8118 + .xword 0xd807aa98a3030242 + .xword 0x12835b0145706fbe + .xword 0x243185be4ee4b28c + .xword 0x550c7dc3d5ffb4e2 + .xword 0x72be5d74f27b896f + .xword 0x80deb1fe3b1696b1 + .xword 0x9bdc06a725c71235 + .xword 0xc19bf174cf692694 + .xword 0xe49b69c19ef14ad2 + .xword 0xefbe4786384f25e3 + .xword 0xfc19dc68b8cd5b5 + .xword 0x240ca1cc77ac9c65 + .xword 0x2de92c6f592b0275 + .xword 0x4a7484aa6ea6e483 + .xword 0x5cb0a9dcbd41fbd4 + .xword 0x76f988da831153b5 + .xword 0x983e5152ee66dfab + .xword 0xa831c66d2db43210 + .xword 0xb00327c898fb213f + .xword 0xbf597fc7beef0ee4 + .xword 0xc6e00bf33da88fc2 + .xword 0xd5a79147930aa725 + .xword 0x6ca6351e003826f + .xword 0x142929670a0e6e70 + .xword 0x27b70a8546d22ffc + .xword 0x2e1b21385c26c926 + .xword 0x4d2c6dfc5ac42aed + .xword 0x53380d139d95b3df + .xword 0x650a73548baf63de + .xword 0x766a0abb3c77b2a8 + .xword 0x81c2c92e47edaee6 + .xword 0x92722c851482353b + .xword 0xa2bfe8a14cf10364 + .xword 0xa81a664bbc423001 + .xword 0xc24b8b70d0f89791 + .xword 0xc76c51a30654be30 + .xword 0xd192e819d6ef5218 + .xword 0xd69906245565a910 + .xword 0xf40e35855771202a + .xword 0x106aa07032bbd1b8 + .xword 0x19a4c116b8d2d0c8 + .xword 0x1e376c085141ab53 + .xword 0x2748774cdf8eeb99 + .xword 0x34b0bcb5e19b48a8 + .xword 0x391c0cb3c5c95a63 + .xword 0x4ed8aa4ae3418acb + .xword 0x5b9cca4f7763e373 + .xword 0x682e6ff3d6b2b8a3 + .xword 0x748f82ee5defb2fc + .xword 0x78a5636f43172f60 + .xword 0x84c87814a1f0ab72 + .xword 0x8cc702081a6439ec + .xword 0x90befffa23631e28 + .xword 0xa4506cebde82bde9 + .xword 0xbef9a3f7b2c67915 + .xword 0xc67178f2e372532b + .xword 0xca273eceea26619c + .xword 0xd186b8c721c0c207 + .xword 0xeada7dd6cde0eb1e + .xword 0xf57d4f7fee6ed178 + .xword 0x6f067aa72176fba + .xword 0xa637dc5a2c898a6 + .xword 0x113f9804bef90dae + .xword 0x1b710b35131c471b + .xword 0x28db77f523047d84 + .xword 0x32caab7b40c72493 + .xword 0x3c9ebe0a15c9bebc + .xword 0x431d67c49c100d4c + .xword 0x4cc5d4becb3e42b6 + .xword 0x597f299cfc657e2a + .xword 0x5fcb6fab3ad6faec + .xword 0x6c44198c4a475817 +#ifndef __APPLE__ +.text +.globl Transform_Sha512_Len_crypto +.type Transform_Sha512_Len_crypto,@function +.align 2 +Transform_Sha512_Len_crypto: +#else +.section __TEXT,__text +.globl _Transform_Sha512_Len_crypto +.p2align 2 +_Transform_Sha512_Len_crypto: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-208]! 
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] + stp d16, d17, [x29, #80] + stp d18, d19, [x29, #96] + stp d20, d21, [x29, #112] + stp d22, d23, [x29, #128] + stp d24, d25, [x29, #144] + stp d26, d27, [x29, #160] + stp d28, d29, [x29, #176] + stp d30, d31, [x29, #192] +#ifndef __APPLE__ + adrp x4, L_SHA512_transform_crypto_len_k + add x4, x4, :lo12:L_SHA512_transform_crypto_len_k +#else + adrp x4, L_SHA512_transform_crypto_len_k@PAGE + add x4, x4, L_SHA512_transform_crypto_len_k@PAGEOFF +#endif /* __APPLE__ */ + # Load first 16 64-bit words of K permanently + ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x4], #0x40 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x4], #0x40 + # Load digest into working vars + ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0] + # Start of loop processing a block +L_sha512_len_crypto_begin: + mov x3, x4 + # Load W + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40 + rev64 v0.16b, v0.16b + rev64 v1.16b, v1.16b + rev64 v2.16b, v2.16b + rev64 v3.16b, v3.16b + rev64 v4.16b, v4.16b + rev64 v5.16b, v5.16b + rev64 v6.16b, v6.16b + rev64 v7.16b, v7.16b + # Copy digest to add in at end + mov v28.16b, v24.16b + mov v29.16b, v25.16b + mov v30.16b, v26.16b + mov v31.16b, v27.16b + # Start of 16 rounds + # Round 0 + add v20.2d, v0.2d, v8.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 1 + add v20.2d, v1.2d, v9.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 2 + add v20.2d, v2.2d, v10.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 3 + add v20.2d, v3.2d, v11.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 4 + add v20.2d, v4.2d, v12.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 5 + add v20.2d, v5.2d, v13.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 6 + add v20.2d, v6.2d, v14.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 7 + add v20.2d, v7.2d, v15.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 8 + sha512su0 v0.2d, v1.2d +
ext v21.16b, v4.16b, v5.16b, #8 + sha512su1 v0.2d, v7.2d, v21.2d + add v20.2d, v0.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 9 + sha512su0 v1.2d, v2.2d + ext v21.16b, v5.16b, v6.16b, #8 + sha512su1 v1.2d, v0.2d, v21.2d + add v20.2d, v1.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 10 + sha512su0 v2.2d, v3.2d + ext v21.16b, v6.16b, v7.16b, #8 + sha512su1 v2.2d, v1.2d, v21.2d + add v20.2d, v2.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 11 + sha512su0 v3.2d, v4.2d + ext v21.16b, v7.16b, v0.16b, #8 + sha512su1 v3.2d, v2.2d, v21.2d + add v20.2d, v3.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 12 + sha512su0 v4.2d, v5.2d + ext v21.16b, v0.16b, v1.16b, #8 + sha512su1 v4.2d, v3.2d, v21.2d + add v20.2d, v4.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 13 + sha512su0 v5.2d, v6.2d + ext v21.16b, v1.16b, v2.16b, #8 + sha512su1 v5.2d, v4.2d, v21.2d + add v20.2d, v5.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 14 + sha512su0 v6.2d, v7.2d + ext v21.16b, v2.16b, v3.16b, #8 + sha512su1 v6.2d, v5.2d, v21.2d + add v20.2d, v6.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 15 + sha512su0 v7.2d, v0.2d + ext v21.16b, v3.16b, v4.16b, #8 + sha512su1 v7.2d, v6.2d, v21.2d + add v20.2d, v7.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 16 + sha512su0 v0.2d, v1.2d + ext v21.16b, v4.16b, v5.16b, #8 + sha512su1 v0.2d, v7.2d, v21.2d + add v20.2d, v0.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 17 + sha512su0 v1.2d, v2.2d + ext v21.16b, v5.16b, v6.16b, #8 + sha512su1 v1.2d, v0.2d, v21.2d + add v20.2d, v1.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, 
#8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 18 + sha512su0 v2.2d, v3.2d + ext v21.16b, v6.16b, v7.16b, #8 + sha512su1 v2.2d, v1.2d, v21.2d + add v20.2d, v2.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 19 + sha512su0 v3.2d, v4.2d + ext v21.16b, v7.16b, v0.16b, #8 + sha512su1 v3.2d, v2.2d, v21.2d + add v20.2d, v3.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 20 + sha512su0 v4.2d, v5.2d + ext v21.16b, v0.16b, v1.16b, #8 + sha512su1 v4.2d, v3.2d, v21.2d + add v20.2d, v4.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 21 + sha512su0 v5.2d, v6.2d + ext v21.16b, v1.16b, v2.16b, #8 + sha512su1 v5.2d, v4.2d, v21.2d + add v20.2d, v5.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 22 + sha512su0 v6.2d, v7.2d + ext v21.16b, v2.16b, v3.16b, #8 + sha512su1 v6.2d, v5.2d, v21.2d + add v20.2d, v6.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 23 + sha512su0 v7.2d, v0.2d + ext v21.16b, v3.16b, v4.16b, #8 + sha512su1 v7.2d, v6.2d, v21.2d + add v20.2d, v7.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 24 + sha512su0 v0.2d, v1.2d + ext v21.16b, v4.16b, v5.16b, #8 + sha512su1 v0.2d, v7.2d, v21.2d + add v20.2d, v0.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 25 + sha512su0 v1.2d, v2.2d + ext v21.16b, v5.16b, v6.16b, #8 + sha512su1 v1.2d, v0.2d, v21.2d + add v20.2d, v1.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 26 + sha512su0 v2.2d, v3.2d + ext v21.16b, v6.16b, v7.16b, #8 + sha512su1 v2.2d, v1.2d, v21.2d + add v20.2d, v2.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 27 + sha512su0 v3.2d, v4.2d + ext v21.16b, v7.16b, v0.16b, #8 + 
sha512su1 v3.2d, v2.2d, v21.2d + add v20.2d, v3.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 28 + sha512su0 v4.2d, v5.2d + ext v21.16b, v0.16b, v1.16b, #8 + sha512su1 v4.2d, v3.2d, v21.2d + add v20.2d, v4.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 29 + sha512su0 v5.2d, v6.2d + ext v21.16b, v1.16b, v2.16b, #8 + sha512su1 v5.2d, v4.2d, v21.2d + add v20.2d, v5.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 30 + sha512su0 v6.2d, v7.2d + ext v21.16b, v2.16b, v3.16b, #8 + sha512su1 v6.2d, v5.2d, v21.2d + add v20.2d, v6.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Round 31 + sha512su0 v7.2d, v0.2d + ext v21.16b, v3.16b, v4.16b, #8 + sha512su1 v7.2d, v6.2d, v21.2d + add v20.2d, v7.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 32 + sha512su0 v0.2d, v1.2d + ext v21.16b, v4.16b, v5.16b, #8 + sha512su1 v0.2d, v7.2d, v21.2d + add v20.2d, v0.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 33 + sha512su0 v1.2d, v2.2d + ext v21.16b, v5.16b, v6.16b, #8 + sha512su1 v1.2d, v0.2d, v21.2d + add v20.2d, v1.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 34 + sha512su0 v2.2d, v3.2d + ext v21.16b, v6.16b, v7.16b, #8 + sha512su1 v2.2d, v1.2d, v21.2d + add v20.2d, v2.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + # Round 35 + sha512su0 v3.2d, v4.2d + ext v21.16b, v7.16b, v0.16b, #8 + sha512su1 v3.2d, v2.2d, v21.2d + add v20.2d, v3.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v26.16b, v27.16b, #8 + ext v22.16b, v25.16b, v26.16b, #8 + add v27.2d, v27.2d, v20.2d + sha512h q27, q21, v22.2d + add v23.2d, v25.2d, v27.2d + sha512h2 q27, q25, v24.2d + # Load next 8 64-bit words of K + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40 + # Round 36 + sha512su0 v4.2d, v5.2d + ext v21.16b, v0.16b, v1.16b, #8 + sha512su1 v4.2d, v3.2d, v21.2d + add v20.2d, v4.2d, v16.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, 
v23.16b, v26.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + add v26.2d, v26.2d, v20.2d + sha512h q26, q21, v22.2d + add v25.2d, v24.2d, v26.2d + sha512h2 q26, q24, v27.2d + # Round 37 + sha512su0 v5.2d, v6.2d + ext v21.16b, v1.16b, v2.16b, #8 + sha512su1 v5.2d, v4.2d, v21.2d + add v20.2d, v5.2d, v17.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v25.16b, v23.16b, #8 + ext v22.16b, v27.16b, v25.16b, #8 + add v23.2d, v23.2d, v20.2d + sha512h q23, q21, v22.2d + add v24.2d, v27.2d, v23.2d + sha512h2 q23, q27, v26.2d + # Round 38 + sha512su0 v6.2d, v7.2d + ext v21.16b, v2.16b, v3.16b, #8 + sha512su1 v6.2d, v5.2d, v21.2d + add v20.2d, v6.2d, v18.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v24.16b, v25.16b, #8 + ext v22.16b, v26.16b, v24.16b, #8 + add v25.2d, v25.2d, v20.2d + sha512h q25, q21, v22.2d + add v27.2d, v26.2d, v25.2d + sha512h2 q25, q26, v23.2d + # Round 39 + sha512su0 v7.2d, v0.2d + ext v21.16b, v3.16b, v4.16b, #8 + sha512su1 v7.2d, v6.2d, v21.2d + add v20.2d, v7.2d, v19.2d + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v27.16b, v24.16b, #8 + ext v22.16b, v23.16b, v27.16b, #8 + add v24.2d, v24.2d, v20.2d + sha512h q24, q21, v22.2d + add v26.2d, v23.2d, v24.2d + sha512h2 q24, q23, v25.2d + add v27.2d, v27.2d, v31.2d + add v26.2d, v26.2d, v30.2d + add v25.2d, v25.2d, v29.2d + add v24.2d, v24.2d, v28.2d + subs w2, w2, #0x80 + bne L_sha512_len_crypto_begin + # Store digest back + st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp d16, d17, [x29, #80] + ldp d18, d19, [x29, #96] + ldp d20, d21, [x29, #112] + ldp d22, d23, [x29, #128] + ldp d24, d25, [x29, #144] + ldp d26, d27, [x29, #160] + ldp d28, d29, [x29, #176] + ldp d30, d31, [x29, #192] + ldp x29, x30, [sp], #0xd0 + ret +#ifndef __APPLE__ + .size Transform_Sha512_Len_crypto,.-Transform_Sha512_Len_crypto +#endif /* __APPLE__ */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA512 */ +#endif /* WOLFSSL_SHA512 */ #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM && WOLFSSL_SHA512 */ +#endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c index 87e7496f0..720756a5b 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -28,10 +28,12 @@ * cd ../scripts * ruby ./sha2/sha512.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-sha512-asm.c */ -#if defined(WOLFSSL_ARMASM) && defined(WOLFSSL_SHA512) +#ifdef WOLFSSL_ARMASM #ifdef __aarch64__ #include +#ifdef WOLFSSL_SHA512 +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512 static const uint64_t L_SHA512_transform_neon_len_k[] = { 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, @@ -120,7 +122,7 @@ static const uint64_t L_SHA512_transform_neon_len_ror8[] = { 0x80f0e0d0c0b0a09UL, }; -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len) { __asm__ __volatile__ ( #ifndef __APPLE__ @@ -1049,5 +1051,613 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) ); } +#else +static const uint64_t L_SHA512_transform_crypto_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, 
+ 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x4, %[L_SHA512_transform_crypto_len_k]\n\t" + "add x4, x4, :lo12:%[L_SHA512_transform_crypto_len_k]\n\t" +#else + "adrp x4, %[L_SHA512_transform_crypto_len_k]@PAGE\n\t" + "add x4, x4, %[L_SHA512_transform_crypto_len_k]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + /* Load first 16 64-bit words of K permantly */ + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x4], #0x40\n\t" + "ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x4], #0x40\n\t" + /* Load digest into working vars */ + "ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_crypto_begin_%=: \n\t" + "mov x3, x4\n\t" + /* Load W */ + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" + "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" + "rev64 v0.16b, v0.16b\n\t" + "rev64 v1.16b, v1.16b\n\t" + "rev64 v2.16b, v2.16b\n\t" + "rev64 v3.16b, v3.16b\n\t" + "rev64 v4.16b, v4.16b\n\t" + "rev64 v5.16b, v5.16b\n\t" + "rev64 v6.16b, v6.16b\n\t" + "rev64 v7.16b, v7.16b\n\t" + /* Copy digest to add in at end */ + "mov v28.16b, v24.16b\n\t" + "mov v29.16b, v25.16b\n\t" + "mov v30.16b, v26.16b\n\t" + "mov v31.16b, v27.16b\n\t" + /* Start of 16 rounds */ + /* Round 0 */ + "add v20.2d, v0.2d, v8.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 1 */ + "add v20.2d, v1.2d, v9.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, 
v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 2 */ + "add v20.2d, v2.2d, v10.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 3 */ + "add v20.2d, v3.2d, v11.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 4 */ + "add v20.2d, v4.2d, v12.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 5 */ + "add v20.2d, v5.2d, v13.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 6 */ + "add v20.2d, v6.2d, v14.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 7 */ + "add v20.2d, v7.2d, v15.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 8 */ + "sha512su0 v0.2d, v1.2d\n\t" + "ext v21.16b, v4.16b, v5.16b, #8\n\t" + "sha512su1 v0.2d, v7.2d, v21.2d\n\t" + "add v20.2d, v0.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 9 */ + "sha512su0 v1.2d, v2.2d\n\t" + "ext v21.16b, v5.16b, v6.16b, #8\n\t" + "sha512su1 v1.2d, v0.2d, v21.2d\n\t" + "add v20.2d, v1.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 10 */ + "sha512su0 v2.2d, v3.2d\n\t" + "ext v21.16b, v6.16b, v7.16b, #8\n\t" + "sha512su1 v2.2d, v1.2d, v21.2d\n\t" + "add v20.2d, v2.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 11 */ + "sha512su0 v3.2d, v4.2d\n\t" + "ext v21.16b, v7.16b, v0.16b, #8\n\t" + "sha512su1 v3.2d, v2.2d, v21.2d\n\t" + "add v20.2d, v3.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, 
v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 12 */ + "sha512su0 v4.2d, v5.2d\n\t" + "ext v21.16b, v0.16b, v1.16b, #8\n\t" + "sha512su1 v4.2d, v3.2d, v21.2d\n\t" + "add v20.2d, v4.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 13 */ + "sha512su0 v5.2d, v6.2d\n\t" + "ext v21.16b, v1.16b, v2.16b, #8\n\t" + "sha512su1 v5.2d, v4.2d, v21.2d\n\t" + "add v20.2d, v5.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 14 */ + "sha512su0 v6.2d, v7.2d\n\t" + "ext v21.16b, v2.16b, v3.16b, #8\n\t" + "sha512su1 v6.2d, v5.2d, v21.2d\n\t" + "add v20.2d, v6.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 15 */ + "sha512su0 v7.2d, v0.2d\n\t" + "ext v21.16b, v3.16b, v4.16b, #8\n\t" + "sha512su1 v7.2d, v6.2d, v21.2d\n\t" + "add v20.2d, v7.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 16 */ + "sha512su0 v0.2d, v1.2d\n\t" + "ext v21.16b, v4.16b, v5.16b, #8\n\t" + "sha512su1 v0.2d, v7.2d, v21.2d\n\t" + "add v20.2d, v0.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 17 */ + "sha512su0 v1.2d, v2.2d\n\t" + "ext v21.16b, v5.16b, v6.16b, #8\n\t" + "sha512su1 v1.2d, v0.2d, v21.2d\n\t" + "add v20.2d, v1.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 18 */ + "sha512su0 v2.2d, v3.2d\n\t" + "ext v21.16b, v6.16b, v7.16b, #8\n\t" + "sha512su1 v2.2d, v1.2d, v21.2d\n\t" + "add v20.2d, v2.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 19 */ + "sha512su0 v3.2d, v4.2d\n\t" + "ext v21.16b, v7.16b, v0.16b, #8\n\t" + "sha512su1 v3.2d, v2.2d, v21.2d\n\t" + "add v20.2d, v3.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, 
v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 20 */ + "sha512su0 v4.2d, v5.2d\n\t" + "ext v21.16b, v0.16b, v1.16b, #8\n\t" + "sha512su1 v4.2d, v3.2d, v21.2d\n\t" + "add v20.2d, v4.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 21 */ + "sha512su0 v5.2d, v6.2d\n\t" + "ext v21.16b, v1.16b, v2.16b, #8\n\t" + "sha512su1 v5.2d, v4.2d, v21.2d\n\t" + "add v20.2d, v5.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 22 */ + "sha512su0 v6.2d, v7.2d\n\t" + "ext v21.16b, v2.16b, v3.16b, #8\n\t" + "sha512su1 v6.2d, v5.2d, v21.2d\n\t" + "add v20.2d, v6.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 23 */ + "sha512su0 v7.2d, v0.2d\n\t" + "ext v21.16b, v3.16b, v4.16b, #8\n\t" + "sha512su1 v7.2d, v6.2d, v21.2d\n\t" + "add v20.2d, v7.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 24 */ + "sha512su0 v0.2d, v1.2d\n\t" + "ext v21.16b, v4.16b, v5.16b, #8\n\t" + "sha512su1 v0.2d, v7.2d, v21.2d\n\t" + "add v20.2d, v0.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 25 */ + "sha512su0 v1.2d, v2.2d\n\t" + "ext v21.16b, v5.16b, v6.16b, #8\n\t" + "sha512su1 v1.2d, v0.2d, v21.2d\n\t" + "add v20.2d, v1.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 26 */ + "sha512su0 v2.2d, v3.2d\n\t" + "ext v21.16b, v6.16b, v7.16b, #8\n\t" + "sha512su1 v2.2d, v1.2d, v21.2d\n\t" + "add v20.2d, v2.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 27 */ + "sha512su0 v3.2d, v4.2d\n\t" + "ext v21.16b, v7.16b, v0.16b, #8\n\t" + "sha512su1 v3.2d, v2.2d, v21.2d\n\t" + "add v20.2d, v3.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, 
#8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 28 */ + "sha512su0 v4.2d, v5.2d\n\t" + "ext v21.16b, v0.16b, v1.16b, #8\n\t" + "sha512su1 v4.2d, v3.2d, v21.2d\n\t" + "add v20.2d, v4.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 29 */ + "sha512su0 v5.2d, v6.2d\n\t" + "ext v21.16b, v1.16b, v2.16b, #8\n\t" + "sha512su1 v5.2d, v4.2d, v21.2d\n\t" + "add v20.2d, v5.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 30 */ + "sha512su0 v6.2d, v7.2d\n\t" + "ext v21.16b, v2.16b, v3.16b, #8\n\t" + "sha512su1 v6.2d, v5.2d, v21.2d\n\t" + "add v20.2d, v6.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Round 31 */ + "sha512su0 v7.2d, v0.2d\n\t" + "ext v21.16b, v3.16b, v4.16b, #8\n\t" + "sha512su1 v7.2d, v6.2d, v21.2d\n\t" + "add v20.2d, v7.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 32 */ + "sha512su0 v0.2d, v1.2d\n\t" + "ext v21.16b, v4.16b, v5.16b, #8\n\t" + "sha512su1 v0.2d, v7.2d, v21.2d\n\t" + "add v20.2d, v0.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 33 */ + "sha512su0 v1.2d, v2.2d\n\t" + "ext v21.16b, v5.16b, v6.16b, #8\n\t" + "sha512su1 v1.2d, v0.2d, v21.2d\n\t" + "add v20.2d, v1.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 34 */ + "sha512su0 v2.2d, v3.2d\n\t" + "ext v21.16b, v6.16b, v7.16b, #8\n\t" + "sha512su1 v2.2d, v1.2d, v21.2d\n\t" + "add v20.2d, v2.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + /* Round 35 */ + "sha512su0 v3.2d, v4.2d\n\t" + "ext v21.16b, v7.16b, v0.16b, #8\n\t" + "sha512su1 v3.2d, v2.2d, v21.2d\n\t" + "add v20.2d, v3.2d, v19.2d\n\t" + 
"ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v26.16b, v27.16b, #8\n\t" + "ext v22.16b, v25.16b, v26.16b, #8\n\t" + "add v27.2d, v27.2d, v20.2d\n\t" + "sha512h q27, q21, v22.2d\n\t" + "add v23.2d, v25.2d, v27.2d\n\t" + "sha512h2 q27, q25, v24.2d\n\t" + /* Load next 8 64-bit words of K */ + "ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #0x40\n\t" + /* Round 36 */ + "sha512su0 v4.2d, v5.2d\n\t" + "ext v21.16b, v0.16b, v1.16b, #8\n\t" + "sha512su1 v4.2d, v3.2d, v21.2d\n\t" + "add v20.2d, v4.2d, v16.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v23.16b, v26.16b, #8\n\t" + "ext v22.16b, v24.16b, v23.16b, #8\n\t" + "add v26.2d, v26.2d, v20.2d\n\t" + "sha512h q26, q21, v22.2d\n\t" + "add v25.2d, v24.2d, v26.2d\n\t" + "sha512h2 q26, q24, v27.2d\n\t" + /* Round 37 */ + "sha512su0 v5.2d, v6.2d\n\t" + "ext v21.16b, v1.16b, v2.16b, #8\n\t" + "sha512su1 v5.2d, v4.2d, v21.2d\n\t" + "add v20.2d, v5.2d, v17.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v25.16b, v23.16b, #8\n\t" + "ext v22.16b, v27.16b, v25.16b, #8\n\t" + "add v23.2d, v23.2d, v20.2d\n\t" + "sha512h q23, q21, v22.2d\n\t" + "add v24.2d, v27.2d, v23.2d\n\t" + "sha512h2 q23, q27, v26.2d\n\t" + /* Round 38 */ + "sha512su0 v6.2d, v7.2d\n\t" + "ext v21.16b, v2.16b, v3.16b, #8\n\t" + "sha512su1 v6.2d, v5.2d, v21.2d\n\t" + "add v20.2d, v6.2d, v18.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v24.16b, v25.16b, #8\n\t" + "ext v22.16b, v26.16b, v24.16b, #8\n\t" + "add v25.2d, v25.2d, v20.2d\n\t" + "sha512h q25, q21, v22.2d\n\t" + "add v27.2d, v26.2d, v25.2d\n\t" + "sha512h2 q25, q26, v23.2d\n\t" + /* Round 39 */ + "sha512su0 v7.2d, v0.2d\n\t" + "ext v21.16b, v3.16b, v4.16b, #8\n\t" + "sha512su1 v7.2d, v6.2d, v21.2d\n\t" + "add v20.2d, v7.2d, v19.2d\n\t" + "ext v20.16b, v20.16b, v20.16b, #8\n\t" + "ext v21.16b, v27.16b, v24.16b, #8\n\t" + "ext v22.16b, v23.16b, v27.16b, #8\n\t" + "add v24.2d, v24.2d, v20.2d\n\t" + "sha512h q24, q21, v22.2d\n\t" + "add v26.2d, v23.2d, v24.2d\n\t" + "sha512h2 q24, q23, v25.2d\n\t" + "add v27.2d, v27.2d, v31.2d\n\t" + "add v26.2d, v26.2d, v30.2d\n\t" + "add v25.2d, v25.2d, v29.2d\n\t" + "add v24.2d, v24.2d, v28.2d\n\t" + "subs %w[len], %w[len], #0x80\n\t" + "bne L_sha512_len_crypto_begin_%=\n\t" + /* Store digest back */ + "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8), [L_SHA512_transform_crypto_len_k] "S" (L_SHA512_transform_crypto_len_k) + : "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA512 */ +#endif /* WOLFSSL_SHA512 */ #endif /* __aarch64__ */ -#endif /* WOLFSSL_ARMASM && WOLFSSL_SHA512 */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512.c b/wolfcrypt/src/port/arm/armv8-sha512.c index 956ee3c3c..c077263c4 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512.c +++ b/wolfcrypt/src/port/arm/armv8-sha512.c @@ -34,7 +34,6 @@ #include #include -#include #include #include @@ -72,32 +71,152 @@ static int InitSha512(wc_Sha512* sha512) return 0; } +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_224) +/** + * Initialize given wc_Sha512 structure with 
value specific to sha512/224. + * Note that sha512/224 has different initial hash value from sha512. + * The initial hash value consists of eight 64bit words. They are given + * in FIPS180-4. + */ +static int InitSha512_224(wc_Sha512* sha512) +{ + if (sha512 == NULL) + return BAD_FUNC_ARG; + + sha512->digest[0] = W64LIT(0x8c3d37c819544da2); + sha512->digest[1] = W64LIT(0x73e1996689dcd4d6); + sha512->digest[2] = W64LIT(0x1dfab7ae32ff9c82); + sha512->digest[3] = W64LIT(0x679dd514582f9fcf); + sha512->digest[4] = W64LIT(0x0f6d2b697bd44da8); + sha512->digest[5] = W64LIT(0x77e36f7304c48942); + sha512->digest[6] = W64LIT(0x3f9d85a86a1d36c8); + sha512->digest[7] = W64LIT(0x1112e6ad91d692a1); + + sha512->buffLen = 0; + sha512->loLen = 0; + sha512->hiLen = 0; +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + sha512->flags = 0; +#endif + + return 0; +} +#endif /* !WOLFSSL_NOSHA512_224 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_256) +/** + * Initialize given wc_Sha512 structure with value specific to sha512/256. + * Note that sha512/256 has different initial hash value from sha512. + * The initial hash value consists of eight 64bit words. They are given + * in FIPS180-4. + */ +static int InitSha512_256(wc_Sha512* sha512) +{ + if (sha512 == NULL) + return BAD_FUNC_ARG; + + sha512->digest[0] = W64LIT(0x22312194fc2bf72c); + sha512->digest[1] = W64LIT(0x9f555fa3c84c64c2); + sha512->digest[2] = W64LIT(0x2393b86b6f53b151); + sha512->digest[3] = W64LIT(0x963877195940eabd); + sha512->digest[4] = W64LIT(0x96283ee2a88effe3); + sha512->digest[5] = W64LIT(0xbe5e1e2553863992); + sha512->digest[6] = W64LIT(0x2b0199fc2c85b8aa); + sha512->digest[7] = W64LIT(0x0eb72ddc81c52ca2); + + sha512->buffLen = 0; + sha512->loLen = 0; + sha512->hiLen = 0; +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) + sha512->flags = 0; +#endif + + return 0; +} +#endif /* !WOLFSSL_NOSHA512_256 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + #endif /* WOLFSSL_SHA512 */ #ifdef WOLFSSL_SHA512 -int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) +#ifdef WOLFSSL_ARMASM +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512 + extern void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, + word32 len); + #define Transform_Sha512_Len Transform_Sha512_Len_neon +#else + extern void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, + word32 len); + #define Transform_Sha512_Len Transform_Sha512_Len_crypto +#endif +#endif + +static int InitSha512_Family(wc_Sha512* sha512, void* heap, int devId, + enum wc_HashType type) { - int ret = 0; + int ret = 0; if (sha512 == NULL) return BAD_FUNC_ARG; sha512->heap = heap; - - ret = InitSha512(sha512); - if (ret != 0) - return ret; - #ifdef WOLFSSL_SMALL_STACK_CACHE sha512->W = NULL; #endif + if (type == WC_HASH_TYPE_SHA512) { + ret = InitSha512(sha512); + } +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_224) + else if (type == WC_HASH_TYPE_SHA512_224) { + ret = InitSha512_224(sha512); + } +#endif +#if !defined(WOLFSSL_NOSHA512_256) + else if (type == WC_HASH_TYPE_SHA512_256) { + ret = InitSha512_256(sha512); + } +#endif +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + else + ret = BAD_FUNC_ARG; + + if (ret != 0) + return ret; + (void)devId; return ret; } +int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) +{ + return InitSha512_Family(sha512, heap, devId, WC_HASH_TYPE_SHA512); +} + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if 
!defined(WOLFSSL_NOSHA512_224) +int wc_InitSha512_224_ex(wc_Sha512* sha512, void* heap, int devId) +{ + return InitSha512_Family(sha512, heap, devId, WC_HASH_TYPE_SHA512_224); +} +#endif /* !WOLFSSL_NOSHA512_224 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_256) +int wc_InitSha512_256_ex(wc_Sha512* sha512, void* heap, int devId) +{ + return InitSha512_Family(sha512, heap, devId, WC_HASH_TYPE_SHA512_256); +} +#endif /* !WOLFSSL_NOSHA512_256 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + #endif /* WOLFSSL_SHA512 */ #ifndef WOLFSSL_ARMASM @@ -297,6 +416,7 @@ static void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len return 0; } #undef DATA + #endif @@ -448,21 +568,72 @@ int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) return 0; } -int wc_Sha512Final(wc_Sha512* sha512, byte* hash) +static int Sha512_Family_Final(wc_Sha512* sha512, byte* hash, + enum wc_HashType type) { int ret; + int digestSz; + int (*initfp)(wc_Sha512*); + + (void)initfp; if (sha512 == NULL || hash == NULL) { return BAD_FUNC_ARG; } + if (type == WC_HASH_TYPE_SHA512) { + initfp = InitSha512; + digestSz = WC_SHA512_DIGEST_SIZE; + } +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_224) + else if (type == WC_HASH_TYPE_SHA512_224) { + initfp = InitSha512_224; + digestSz = WC_SHA512_224_DIGEST_SIZE; + } +#endif +#if !defined(WOLFSSL_NOSHA512_256) + else if (type == WC_HASH_TYPE_SHA512_256) { + initfp = InitSha512_256; + digestSz = WC_SHA512_256_DIGEST_SIZE; + } +#endif +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + else + return BAD_FUNC_ARG; + +#ifdef WOLF_CRYPTO_CB + if (sha512->devId != INVALID_DEVID) { + ret = wc_CryptoCb_Sha512Hash(sha512, NULL, 0, hash); + if (ret != CRYPTOCB_UNAVAILABLE) + return ret; + /* fall-through when unavailable */ + } +#endif +#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) + if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) { + #if defined(HAVE_INTEL_QA) + return IntelQaSymSha512(&sha512->asyncDev, hash, NULL, digestSz); + #endif + } +#endif /* WOLFSSL_ASYNC_CRYPT */ + ret = Sha512Final(sha512); if (ret != 0) return ret; - XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); + XMEMCPY(hash, sha512->digest, digestSz); - return InitSha512(sha512); /* reset state */ + /* initialize Sha512 structure for the next use */ + if (initfp != NULL) { + ret = initfp(sha512); + } + return ret; +} + +int wc_Sha512Final(wc_Sha512* sha512, byte* hash) +{ + return Sha512_Family_Final(sha512, hash, WC_HASH_TYPE_SHA512); } int wc_InitSha512(wc_Sha512* sha512) @@ -618,6 +789,42 @@ void wc_Sha384Free(wc_Sha384* sha384) #ifdef WOLFSSL_SHA512 +static int Sha512_Family_GetHash(wc_Sha512* sha512, byte* hash, + enum wc_HashType type ) +{ + int (*finalfp)(wc_Sha512*, byte*); + int ret; + wc_Sha512 tmpSha512; + + if (sha512 == NULL || hash == NULL) + return BAD_FUNC_ARG; + + if (type == WC_HASH_TYPE_SHA512) + finalfp = wc_Sha512Final; +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) +#if !defined(WOLFSSL_NOSHA512_224) + else if (type == WC_HASH_TYPE_SHA512_224) + finalfp = wc_Sha512_224Final; +#endif +#if !defined(WOLFSSL_NOSHA512_256) + else if (type == WC_HASH_TYPE_SHA512_256) + finalfp = wc_Sha512_256Final; +#endif +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + else + finalfp = NULL; + + if (finalfp == NULL) + return BAD_FUNC_ARG; + + ret = wc_Sha512Copy(sha512, &tmpSha512); + if (ret == 0) { + ret = finalfp(&tmpSha512, hash); + 
wc_Sha512Free(&tmpSha512); + } + return ret; +} + int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash) { int ret; @@ -670,6 +877,109 @@ int wc_Sha512GetFlags(wc_Sha512* sha512, word32* flags) } #endif +#if !defined(HAVE_FIPS) && !defined(HAVE_SELFTEST) + +#if !defined(WOLFSSL_NOSHA512_224) +int wc_InitSha512_224(wc_Sha512* sha) +{ + return wc_InitSha512_224_ex(sha, NULL, INVALID_DEVID); +} +int wc_Sha512_224Update(wc_Sha512* sha, const byte* data, word32 len) +{ + return wc_Sha512Update(sha, data, len); +} +int wc_Sha512_224FinalRaw(wc_Sha512* sha, byte* hash) +{ + return wc_Sha512FinalRaw(sha, hash); +} +int wc_Sha512_224Final(wc_Sha512* sha512, byte* hash) +{ + return Sha512_Family_Final(sha512, hash, WC_HASH_TYPE_SHA512_224); +} +void wc_Sha512_224Free(wc_Sha512* sha) +{ + wc_Sha512Free(sha); +} +int wc_Sha512_224GetHash(wc_Sha512* sha512, byte* hash) +{ + return Sha512_Family_GetHash(sha512, hash, WC_HASH_TYPE_SHA512_224); +} +int wc_Sha512_224Copy(wc_Sha512* src, wc_Sha512* dst) +{ + return wc_Sha512Copy(src, dst); +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha512_224SetFlags(wc_Sha512* sha, word32 flags) +{ + return wc_Sha512SetFlags(sha, flags); +} +int wc_Sha512_224GetFlags(wc_Sha512* sha, word32* flags) +{ + return wc_Sha512GetFlags(sha, flags); +} +#endif /* WOLFSSL_HASH_FLAGS || WOLF_CRYPTO_CB */ + +#if defined(OPENSSL_EXTRA) +int wc_Sha512_224Transform(wc_Sha512* sha, const unsigned char* data) +{ + return wc_Sha512Transform(sha, data); +} +#endif /* OPENSSL_EXTRA */ + +#endif /* !WOLFSSL_NOSHA512_224 */ + +#if !defined(WOLFSSL_NOSHA512_256) +int wc_InitSha512_256(wc_Sha512* sha) +{ + return wc_InitSha512_256_ex(sha, NULL, INVALID_DEVID); +} +int wc_Sha512_256Update(wc_Sha512* sha, const byte* data, word32 len) +{ + return wc_Sha512Update(sha, data, len); +} +int wc_Sha512_256FinalRaw(wc_Sha512* sha, byte* hash) +{ + return wc_Sha512FinalRaw(sha, hash); +} +int wc_Sha512_256Final(wc_Sha512* sha512, byte* hash) +{ + return Sha512_Family_Final(sha512, hash, WC_HASH_TYPE_SHA512_256); +} +void wc_Sha512_256Free(wc_Sha512* sha) +{ + wc_Sha512Free(sha); +} +int wc_Sha512_256GetHash(wc_Sha512* sha512, byte* hash) +{ + return Sha512_Family_GetHash(sha512, hash, WC_HASH_TYPE_SHA512_256); +} +int wc_Sha512_256Copy(wc_Sha512* src, wc_Sha512* dst) +{ + return wc_Sha512Copy(src, dst); +} + +#if defined(WOLFSSL_HASH_FLAGS) || defined(WOLF_CRYPTO_CB) +int wc_Sha512_256SetFlags(wc_Sha512* sha, word32 flags) +{ + return wc_Sha512SetFlags(sha, flags); +} +int wc_Sha512_256GetFlags(wc_Sha512* sha, word32* flags) +{ + return wc_Sha512GetFlags(sha, flags); +} +#endif /* WOLFSSL_HASH_FLAGS || WOLF_CRYPTO_CB */ + +#if defined(OPENSSL_EXTRA) +int wc_Sha512_256Transform(wc_Sha512* sha, const unsigned char* data) +{ + return wc_Sha512Transform(sha, data); +} +#endif /* OPENSSL_EXTRA */ + +#endif /* !WOLFSSL_NOSHA512_256 */ +#endif /* !HAVE_FIPS && !HAVE_SELFTEST */ + #endif /* WOLFSSL_SHA512 */ #ifdef WOLFSSL_SHA384 diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 0247bbb8e..6785bf9ff 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -2824,6 +2824,17 @@ WOLFSSL_TEST_SUBROUTINE int sha512_test(void) ERROR_OUT(-2409, exit); if (XMEMCMP(hash, large_digest, WC_SHA512_DIGEST_SIZE) != 0) ERROR_OUT(-2410, exit); + + /* Unaligned memory access test */ + for (i = 1; i < 16; i++) { + ret = wc_Sha512Update(&sha, (byte*)large_input + i, + (word32)sizeof(large_input) - i); + if (ret != 0) + ERROR_OUT(-2411, exit); + ret = wc_Sha512Final(&sha, hash); + if (ret != 0) + ERROR_OUT(-2412, exit); + }
/* END LARGE HASH TEST */ exit: diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index 91a725b96..865b5a28b 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -35,6 +35,11 @@ #if (defined(WOLFSSL_X86_64_BUILD) || defined(USE_INTEL_SPEEDUP) || \ defined(WOLFSSL_AESNI) || defined(WOLFSSL_SP_X86_64_ASM)) && \ !defined(WOLFSSL_NO_ASM) + #define HAVE_CPUID + #define HAVE_CPUID_INTEL +#endif + +#ifdef HAVE_CPUID_INTEL #define CPUID_AVX1 0x0001 #define CPUID_AVX2 0x0002 @@ -54,6 +59,9 @@ #define IS_INTEL_ADX(f) ((f) & CPUID_ADX) #define IS_INTEL_MOVBE(f) ((f) & CPUID_MOVBE) +#endif + +#ifdef HAVE_CPUID void cpuid_set_flags(void); word32 cpuid_get_flags(void); diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h index 7d9374923..6737ae9ba 100644 --- a/wolfssl/wolfcrypt/sha512.h +++ b/wolfssl/wolfcrypt/sha512.h @@ -186,11 +186,6 @@ struct wc_Sha512 { #endif /* HAVE_FIPS */ -#ifdef WOLFSSL_ARMASM -WOLFSSL_LOCAL void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, - word32 len); -#endif - #ifdef WOLFSSL_SHA512
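Editor's note on the new Transform_Sha512_Len_crypto (both the .S and the inline-asm versions above): each "Round N" block retires two SHA-512 rounds at once, with v24-v27 holding the digest as the pairs {a,b}, {c,d}, {e,f}, {g,h} and the register roles rotating from round to round. The same step is easier to follow written with the ACLE intrinsics from arm_neon.h. A minimal sketch of one double round, assuming a toolchain that defines __ARM_FEATURE_SHA512 (e.g. -march=armv8.2-a+sha3); the helper name is ours, and this illustrates the instruction pattern rather than being code from the patch:

    #include <arm_neon.h>

    /* One SHA-512 double round, mirroring the add/ext/sha512h/add/sha512h2
     * sequence in the assembly above. w01 is the next two message words and
     * k01 the matching pair of K constants; the caller rotates the
     * ab/cd/ef/gh roles between calls, as the assembly does with its
     * registers. */
    static inline void sha512_double_round(uint64x2_t *ab, uint64x2_t *cd,
                                           uint64x2_t *ef, uint64x2_t *gh,
                                           uint64x2_t w01, uint64x2_t k01)
    {
        uint64x2_t wk  = vaddq_u64(w01, k01);                  /* add W+K   */
        uint64x2_t sum = vaddq_u64(vextq_u64(wk, wk, 1), *gh); /* ext + add */
        uint64x2_t t   = vsha512hq_u64(sum, vextq_u64(*ef, *gh, 1),
                                       vextq_u64(*cd, *ef, 1)); /* sha512h  */
        *gh = vsha512h2q_u64(t, *cd, *ab);                      /* sha512h2 */
        *cd = vaddq_u64(*cd, t);                                /* add      */
    }

    /* The message schedule (rounds 8 and later) maps the same way: the
     * sha512su0/ext/sha512su1 triple becomes
     *   w01 = vsha512su1q_u64(vsha512su0q_u64(w01, w23), w1415,
     *                         vextq_u64(w89, w1011, 1));                  */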
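Editor's note on feature selection: the configure.ac and cpuid.h/cpuid.c hunks choose the crypto-extension transform purely at compile time (--enable-armasm=sha512-crypto defines WOLFSSL_ARMASM_CRYPTO_SHA512; the new HAVE_CPUID/HAVE_CPUID_INTEL split only generalizes the existing x86 probing). For comparison, a runtime probe on AArch64 Linux could look like the sketch below. It uses the standard getauxval()/HWCAP_SHA512 interface, but nothing like it is wired into this patch, and the helper name is ours:

    #include <sys/auxv.h>   /* getauxval(), AT_HWCAP (Linux) */
    #include <asm/hwcap.h>  /* HWCAP_SHA512 (AArch64 Linux)  */

    /* Returns nonzero when the kernel reports FEAT_SHA512, i.e. the
     * sha512h/sha512h2/sha512su0/sha512su1 instructions are usable. */
    static int cpu_has_sha512(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_SHA512) != 0;
    }

Because the selection is compile-time only, a library built with sha512-crypto must run on cores that implement FEAT_SHA512; on anything older the new code path would fault with SIGILL.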
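Editor's note on the truncated-SHA-512 API: the armv8-sha512.c hunks also wire up SHA-512/224 and SHA-512/256 (FIPS 180-4), which reuse the wc_Sha512 state and differ from SHA-512 only in their initial hash values and digest length. A short usage sketch; the one-shot wrapper is ours, but the wc_* calls and WC_SHA512_256_DIGEST_SIZE come from the patch:

    #include <wolfssl/wolfcrypt/sha512.h>

    /* Hash a buffer with SHA-512/256 via the API added in this patch. */
    static int sha512_256_digest(const byte* data, word32 len,
                                 byte hash[WC_SHA512_256_DIGEST_SIZE])
    {
        wc_Sha512 sha;                     /* shared SHA-512 state */
        int ret = wc_InitSha512_256(&sha);
        if (ret == 0)
            ret = wc_Sha512_256Update(&sha, data, len);
        if (ret == 0)
            ret = wc_Sha512_256Final(&sha, hash); /* 32-byte digest;
                                                     also re-inits state */
        wc_Sha512_256Free(&sha);
        return ret;
    }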