From 805b0eb606026e445b75362aa3391fdd145f5173 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 18 May 2022 16:23:48 +1000 Subject: [PATCH 1/9] ARM ASM: ARMv7a with NEON instructions Change to build assembly code for ARMv7a with NEON instruction set. ./configure -host=armv7a --enable-armasm Added ARM32 SHA-256 NEON only implementation. --- configure.ac | 16 +- src/include.am | 16 +- wolfcrypt/src/aes.c | 6 +- wolfcrypt/src/port/arm/armv8-32-sha256-asm.S | 2504 +++++++++++++++++ .../src/port/arm/armv8-32-sha256-asm_c.c | 2499 ++++++++++++++++ wolfcrypt/src/port/arm/armv8-32-sha512-asm.S | 16 +- .../src/port/arm/armv8-32-sha512-asm_c.c | 18 +- wolfcrypt/src/port/arm/armv8-aes.c | 3 +- wolfcrypt/src/port/arm/armv8-sha256.c | 107 + 9 files changed, 5159 insertions(+), 26 deletions(-) create mode 100644 wolfcrypt/src/port/arm/armv8-32-sha256-asm.S create mode 100644 wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c diff --git a/configure.ac b/configure.ac index b1ea55171..c49e60fba 100644 --- a/configure.ac +++ b/configure.ac @@ -2061,6 +2061,7 @@ then esac # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" + ENABLED_ARMASM_CRYPTO=yes # Check for and set -mstrict-align compiler flag # Used to set assumption that Aarch64 systems will not handle @@ -2077,12 +2078,22 @@ then AM_CPPFLAGS="$AM_CPPFLAGS -mstrict-align" AC_MSG_NOTICE([64bit ARMv8, setting -mstrict-align]);; esac - AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]);; + AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]) + ;; + armv7a) + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon-vfpv3 -DWOLFSSL_ARMASM_NO_CRYPTO" + # Include options.h + AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" + ENABLED_ARMASM_CRYPTO=no + AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon-vfpv3]) + ;; *) AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" - AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8]);; + ENABLED_ARMASM_CRYPTO=yes + AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8]) + ;; esac esac fi @@ -7998,6 +8009,7 @@ AM_CONDITIONAL([BUILD_AESGCM],[test "x$ENABLED_AESGCM" = "xyes" || test "x$ENABL AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"]) +AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"]) AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"]) AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"]) AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"]) diff --git a/src/include.am b/src/include.am index ab12ae4e9..9a8b70a1b 100644 --- a/src/include.am +++ b/src/include.am @@ -187,7 +187,7 @@ endif if BUILD_AES src_libwolfssl_la_SOURCES += wolfcrypt/src/aes.c -if BUILD_ARMASM +if BUILD_ARMASM_CRYPTO src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c endif endif @@ -203,6 +203,11 @@ endif if BUILD_ARMASM src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c +if BUILD_ARMASM_INLINE +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +else +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +endif else src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c if BUILD_INTELASM @@ -300,10 +305,15 @@ endif endif !BUILD_FIPS_CURRENT 
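The configure.ac and include.am hunks above wire the new armv7a host case and the BUILD_ARMASM_CRYPTO automake conditional to a WOLFSSL_ARMASM_NO_CRYPTO preprocessor define. A minimal C-preprocessor sketch of how these flags are expected to combine at compile time follows; the branch comments are assumptions summarizing the aes.c and include.am hunks in this patch, not code taken from it.

/* ./configure --host=armv7a --enable-armasm (configure.ac hunk above):
 *   AM_CPPFLAGS += -march=armv7-a -mfpu=neon-vfpv3 -DWOLFSSL_ARMASM_NO_CRYPTO
 *   BUILD_ARMASM=yes, BUILD_ARMASM_CRYPTO=no */
#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO)
    /* ARMv8 with Crypto Extensions: AES comes from
     * wolfcrypt/src/port/arm/armv8-aes.c. */
#elif defined(WOLFSSL_ARMASM)
    /* ARMv7-A with NEON only: AES falls back to the generic C code in aes.c,
     * while SHA-256 uses the new ARM32 Transform_Sha256_Len() from
     * armv8-32-sha256-asm.S (or armv8-32-sha256-asm_c.c when inline asm is
     * selected). */
#else
    /* No ARM assembly: plain C implementations. */
#endif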
if !BUILD_FIPS_CURRENT +src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c if BUILD_ARMASM src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c +if BUILD_ARMASM_INLINE +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +else +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +endif else -src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c if BUILD_INTELASM src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256_asm.S endif @@ -383,7 +393,7 @@ endif if !BUILD_FIPS_CURRENT if BUILD_AES src_libwolfssl_la_SOURCES += wolfcrypt/src/aes.c -if BUILD_ARMASM +if BUILD_ARMASM_CRYPTO src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c endif if BUILD_AFALG diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index cc1055d43..e8fa52d22 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -306,7 +306,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#if !defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_CRYPTO) #ifdef WOLFSSL_IMX6_CAAM_BLOB /* case of possibly not using hardware acceleration for AES but using key @@ -4601,7 +4601,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #endif -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO) /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(WOLFSSL_AFALG) @@ -9933,7 +9933,7 @@ int wc_AesCcmCheckTagSize(int sz) return 0; } -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO) /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(HAVE_COLDFIRE_SEC) diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S new file mode 100644 index 000000000..6132aac4a --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -0,0 +1,2504 @@ +/* armv8-32-sha256-asm + * + * Copyright (C) 2006-2021 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S + */ + +#include + +#ifdef WOLFSSL_ARMASM +#ifndef __aarch64__ +#ifndef NO_SHA256 +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA256_transform_len_k, %object + .size L_SHA256_transform_len_k, 256 + .align 3 +L_SHA256_transform_len_k: + .word 0x428a2f98 + .word 0x71374491 + .word 0xb5c0fbcf + .word 0xe9b5dba5 + .word 0x3956c25b + .word 0x59f111f1 + .word 0x923f82a4 + .word 0xab1c5ed5 + .word 0xd807aa98 + .word 0x12835b01 + .word 0x243185be + .word 0x550c7dc3 + .word 0x72be5d74 + .word 0x80deb1fe + .word 0x9bdc06a7 + .word 0xc19bf174 + .word 0xe49b69c1 + .word 0xefbe4786 + .word 0xfc19dc6 + .word 0x240ca1cc + .word 0x2de92c6f + .word 0x4a7484aa + .word 0x5cb0a9dc + .word 0x76f988da + .word 0x983e5152 + .word 0xa831c66d + .word 0xb00327c8 + .word 0xbf597fc7 + .word 0xc6e00bf3 + .word 0xd5a79147 + .word 0x6ca6351 + .word 0x14292967 + .word 0x27b70a85 + .word 0x2e1b2138 + .word 0x4d2c6dfc + .word 0x53380d13 + .word 0x650a7354 + .word 0x766a0abb + .word 0x81c2c92e + .word 0x92722c85 + .word 0xa2bfe8a1 + .word 0xa81a664b + .word 0xc24b8b70 + .word 0xc76c51a3 + .word 0xd192e819 + .word 0xd6990624 + .word 0xf40e3585 + .word 0x106aa070 + .word 0x19a4c116 + .word 0x1e376c08 + .word 0x2748774c + .word 0x34b0bcb5 + .word 0x391c0cb3 + .word 0x4ed8aa4a + .word 0x5b9cca4f + .word 0x682e6ff3 + .word 0x748f82ee + .word 0x78a5636f + .word 0x84c87814 + .word 0x8cc70208 + .word 0x90befffa + .word 0xa4506ceb + .word 0xbef9a3f7 + .word 0xc67178f2 + .text + .align 2 + .globl Transform_Sha256_Len + .type Transform_Sha256_Len, %function +Transform_Sha256_Len: + push {r4, r5, r6, r7, r8, r9, r10, lr} + sub sp, sp, #0xc0 + adr r3, L_SHA256_transform_len_k + # Copy digest to add in at end + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + ldrd r8, r9, [r0, #24] + str r12, [sp, #64] + str lr, [sp, #68] + strd r4, r5, [sp, #72] + strd r6, r7, [sp, #80] + strd r8, r9, [sp, #88] + # Start of loop processing a block +L_SHA256_transform_len_begin: + # Load, Reverse and Store W - 64 bytes + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r1, #16] + ldrd r8, r9, [r1, #24] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str r12, [sp] + str lr, [sp, #4] + strd r4, r5, [sp, #8] + strd r6, r7, [sp, #16] + strd r8, r9, [sp, #24] + ldr r12, [r1, #32] + ldr lr, [r1, #36] + ldrd r4, r5, [r1, #40] + ldrd r6, r7, [r1, #48] + ldrd r8, r9, [r1, #56] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str r12, [sp, #32] + str lr, [sp, #36] + strd r4, r5, [sp, #40] + strd r6, r7, [sp, #48] + strd r8, r9, [sp, #56] + ldr r9, [r0, #4] + ldr r12, [r0, #8] + eor r9, r9, r12 + mov r10, #3 + # Start of 16 rounds +L_SHA256_transform_len_start: + # Round 0 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r7, [r0, #28] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp] + ldr r4, [r3] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + 
ldr r6, [r0, #12] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #12] + str r7, [r0, #28] + # Calc new W[0] + ldr r4, [sp, #56] + ldr r5, [sp, #36] + ldr r6, [sp, #4] + ldr r7, [sp] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp] + # Round 1 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r7, [r0, #24] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #4] + ldr r4, [r3, #4] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #8] + str r7, [r0, #24] + # Calc new W[1] + ldr r4, [sp, #60] + ldr r5, [sp, #40] + ldr r6, [sp, #8] + ldr r7, [sp, #4] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #4] + # Round 2 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r7, [r0, #20] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #8] + ldr r4, [r3, #8] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r6, [r0, #4] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #4] + str r7, [r0, #20] + # Calc new W[2] + ldr r4, [sp] + ldr r5, [sp, #44] + ldr r6, [sp, #12] + ldr r7, [sp, #8] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #8] + # Round 3 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r7, [r0, #16] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #12] + ldr r4, [r3, #12] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r6, [r0] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0] + str r7, [r0, #16] + # Calc new W[3] + ldr r4, [sp, #4] + ldr r5, [sp, #48] + ldr r6, [sp, #16] + ldr r7, [sp, #12] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #12] + # Round 4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r7, [r0, #12] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 
+ ldr lr, [sp, #16] + ldr r4, [r3, #16] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #28] + str r7, [r0, #12] + # Calc new W[4] + ldr r4, [sp, #8] + ldr r5, [sp, #52] + ldr r6, [sp, #20] + ldr r7, [sp, #16] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #16] + # Round 5 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r7, [r0, #8] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #20] + ldr r4, [r3, #20] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r6, [r0, #24] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #24] + str r7, [r0, #8] + # Calc new W[5] + ldr r4, [sp, #12] + ldr r5, [sp, #56] + ldr r6, [sp, #24] + ldr r7, [sp, #20] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #20] + # Round 6 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r7, [r0, #4] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #24] + ldr r4, [r3, #24] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #20] + str r7, [r0, #4] + # Calc new W[6] + ldr r4, [sp, #16] + ldr r5, [sp, #60] + ldr r6, [sp, #28] + ldr r7, [sp, #24] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #24] + # Round 7 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r7, [r0] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #28] + ldr r4, [r3, #28] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #16] + str r7, [r0] + # Calc new W[7] + ldr r4, [sp, #20] + ldr r5, [sp] + ldr r6, [sp, #32] + ldr r7, [sp, #28] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #28] + # Round 8 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r7, [r0, 
#28] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #32] + ldr r4, [r3, #32] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #12] + str r7, [r0, #28] + # Calc new W[8] + ldr r4, [sp, #24] + ldr r5, [sp, #4] + ldr r6, [sp, #36] + ldr r7, [sp, #32] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #32] + # Round 9 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r7, [r0, #24] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #36] + ldr r4, [r3, #36] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #8] + str r7, [r0, #24] + # Calc new W[9] + ldr r4, [sp, #28] + ldr r5, [sp, #8] + ldr r6, [sp, #40] + ldr r7, [sp, #36] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #36] + # Round 10 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r7, [r0, #20] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #40] + ldr r4, [r3, #40] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r6, [r0, #4] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #4] + str r7, [r0, #20] + # Calc new W[10] + ldr r4, [sp, #32] + ldr r5, [sp, #12] + ldr r6, [sp, #44] + ldr r7, [sp, #40] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #40] + # Round 11 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r7, [r0, #16] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #44] + ldr r4, [r3, #44] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r6, [r0] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0] + str r7, [r0, #16] + # Calc new W[11] + ldr r4, [sp, #36] + ldr r5, [sp, #16] + ldr r6, [sp, #48] + ldr r7, [sp, #44] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr 
#3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #44] + # Round 12 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r7, [r0, #12] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #48] + ldr r4, [r3, #48] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #28] + str r7, [r0, #12] + # Calc new W[12] + ldr r4, [sp, #40] + ldr r5, [sp, #20] + ldr r6, [sp, #52] + ldr r7, [sp, #48] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #48] + # Round 13 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r7, [r0, #8] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #52] + ldr r4, [r3, #52] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r6, [r0, #24] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #24] + str r7, [r0, #8] + # Calc new W[13] + ldr r4, [sp, #44] + ldr r5, [sp, #24] + ldr r6, [sp, #56] + ldr r7, [sp, #52] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #52] + # Round 14 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r7, [r0, #4] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #56] + ldr r4, [r3, #56] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #20] + str r7, [r0, #4] + # Calc new W[14] + ldr r4, [sp, #48] + ldr r5, [sp, #28] + ldr r6, [sp, #60] + ldr r7, [sp, #56] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #56] + # Round 15 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r7, [r0] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #60] + ldr r4, [r3, #60] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #16] + str r7, [r0] + # Calc new W[15] + ldr r4, [sp, #52] + ldr r5, [sp, #32] + ldr r6, 
[sp] + ldr r7, [sp, #60] + ror r12, r4, #17 + ror lr, r6, #7 + eor r12, r12, r4, ror #19 + eor lr, lr, r6, ror #18 + eor r12, r12, r4, lsr #10 + eor lr, lr, r6, lsr #3 + add r7, r7, r5 + add r12, r12, lr + add r7, r7, r12 + str r7, [sp, #60] + add r3, r3, #0x40 + subs r10, r10, #1 + bne L_SHA256_transform_len_start + # Round 0 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r7, [r0, #28] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp] + ldr r4, [r3] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #12] + str r7, [r0, #28] + # Round 1 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r7, [r0, #24] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #4] + ldr r4, [r3, #4] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #8] + str r7, [r0, #24] + # Round 2 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r7, [r0, #20] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #8] + ldr r4, [r3, #8] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r6, [r0, #4] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #4] + str r7, [r0, #20] + # Round 3 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r7, [r0, #16] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #12] + ldr r4, [r3, #12] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r6, [r0] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0] + str r7, [r0, #16] + # Round 4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r7, [r0, #12] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #16] + ldr r4, [r3, #16] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #28] + str r7, [r0, #12] + # Round 5 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r7, [r0, #8] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, 
r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #20] + ldr r4, [r3, #20] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r6, [r0, #24] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #24] + str r7, [r0, #8] + # Round 6 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r7, [r0, #4] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #24] + ldr r4, [r3, #24] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #20] + str r7, [r0, #4] + # Round 7 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r7, [r0] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #28] + ldr r4, [r3, #28] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #16] + str r7, [r0] + # Round 8 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r7, [r0, #28] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #32] + ldr r4, [r3, #32] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r6, [r0, #12] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #12] + str r7, [r0, #28] + # Round 9 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r7, [r0, #24] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #36] + ldr r4, [r3, #36] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #8] + str r7, [r0, #24] + # Round 10 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r7, [r0, #20] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #40] + ldr r4, [r3, #40] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r6, [r0, #4] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #4] + str r7, [r0, #20] + # Round 11 + ldr lr, [r0, #4] + ldr r4, 
[r0, #8] + ldr r5, [r0, #12] + ldr r7, [r0, #16] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #44] + ldr r4, [r3, #44] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r6, [r0] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0] + str r7, [r0, #16] + # Round 12 + ldr lr, [r0] + ldr r4, [r0, #4] + ldr r5, [r0, #8] + ldr r7, [r0, #12] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #48] + ldr r4, [r3, #48] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #16] + ldr r4, [r0, #20] + ldr r5, [r0, #24] + ldr r6, [r0, #28] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #28] + str r7, [r0, #12] + # Round 13 + ldr lr, [r0, #28] + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r7, [r0, #8] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #52] + ldr r4, [r3, #52] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #12] + ldr r4, [r0, #16] + ldr r5, [r0, #20] + ldr r6, [r0, #24] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #24] + str r7, [r0, #8] + # Round 14 + ldr lr, [r0, #24] + ldr r4, [r0, #28] + ldr r5, [r0] + ldr r7, [r0, #4] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #56] + ldr r4, [r3, #56] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #8] + ldr r4, [r0, #12] + ldr r5, [r0, #16] + ldr r6, [r0, #20] + ror r12, lr, #2 + eor r8, lr, r4 + eor r12, r12, lr, ror #13 + and r9, r9, r8 + eor r12, r12, lr, ror #22 + eor r9, r9, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r9 + str r6, [r0, #20] + str r7, [r0, #4] + # Round 15 + ldr lr, [r0, #20] + ldr r4, [r0, #24] + ldr r5, [r0, #28] + ldr r7, [r0] + ror r12, lr, #6 + eor r4, r4, r5 + eor r12, r12, lr, ror #11 + and r4, r4, lr + eor r12, r12, lr, ror #25 + eor r4, r4, r5 + add r7, r7, r12 + add r7, r7, r4 + ldr lr, [sp, #60] + ldr r4, [r3, #60] + add r7, r7, lr + add r7, r7, r4 + ldr lr, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ror r12, lr, #2 + eor r9, lr, r4 + eor r12, r12, lr, ror #13 + and r8, r8, r9 + eor r12, r12, lr, ror #22 + eor r8, r8, r4 + add r6, r6, r7 + add r7, r7, r12 + add r7, r7, r8 + str r6, [r0, #16] + str r7, [r0] + # Add in digest from start + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #64] + ldrd r8, r9, [sp, #72] + add r12, r12, r6 + add lr, lr, r7 + add r4, r4, r8 + add r5, r5, r9 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + str r12, [sp, #64] + str lr, [sp, #68] + strd r4, r5, [sp, #72] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #80] + ldrd r8, r9, [sp, #88] + add r12, r12, r6 + add lr, lr, r7 + add r4, r4, r8 + add r5, r5, r9 + 
str r12, [r0, #16] + str lr, [r0, #20] + strd r4, r5, [r0, #24] + str r12, [sp, #80] + str lr, [sp, #84] + strd r4, r5, [sp, #88] + subs r2, r2, #0x40 + sub r3, r3, #0xc0 + add r1, r1, #0x40 + bne L_SHA256_transform_len_begin + add sp, sp, #0xc0 + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size Transform_Sha256_Len,.-Transform_Sha256_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifndef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA256_transform_neon_len_k, %object + .size L_SHA256_transform_neon_len_k, 256 + .align 3 +L_SHA256_transform_neon_len_k: + .word 0x428a2f98 + .word 0x71374491 + .word 0xb5c0fbcf + .word 0xe9b5dba5 + .word 0x3956c25b + .word 0x59f111f1 + .word 0x923f82a4 + .word 0xab1c5ed5 + .word 0xd807aa98 + .word 0x12835b01 + .word 0x243185be + .word 0x550c7dc3 + .word 0x72be5d74 + .word 0x80deb1fe + .word 0x9bdc06a7 + .word 0xc19bf174 + .word 0xe49b69c1 + .word 0xefbe4786 + .word 0xfc19dc6 + .word 0x240ca1cc + .word 0x2de92c6f + .word 0x4a7484aa + .word 0x5cb0a9dc + .word 0x76f988da + .word 0x983e5152 + .word 0xa831c66d + .word 0xb00327c8 + .word 0xbf597fc7 + .word 0xc6e00bf3 + .word 0xd5a79147 + .word 0x6ca6351 + .word 0x14292967 + .word 0x27b70a85 + .word 0x2e1b2138 + .word 0x4d2c6dfc + .word 0x53380d13 + .word 0x650a7354 + .word 0x766a0abb + .word 0x81c2c92e + .word 0x92722c85 + .word 0xa2bfe8a1 + .word 0xa81a664b + .word 0xc24b8b70 + .word 0xc76c51a3 + .word 0xd192e819 + .word 0xd6990624 + .word 0xf40e3585 + .word 0x106aa070 + .word 0x19a4c116 + .word 0x1e376c08 + .word 0x2748774c + .word 0x34b0bcb5 + .word 0x391c0cb3 + .word 0x4ed8aa4a + .word 0x5b9cca4f + .word 0x682e6ff3 + .word 0x748f82ee + .word 0x78a5636f + .word 0x84c87814 + .word 0x8cc70208 + .word 0x90befffa + .word 0xa4506ceb + .word 0xbef9a3f7 + .word 0xc67178f2 + .text + .align 2 + .globl Transform_Sha256_Len + .type Transform_Sha256_Len, %function +Transform_Sha256_Len: + push {r4, r5, r6, r7, r8, r9, r10, lr} + vpush {d8-d11} + sub sp, sp, #24 + strd r0, r1, [sp] + str r2, [sp, #8] + adr r12, L_SHA256_transform_neon_len_k + # Load digest into registers + ldrd r2, r3, [r0] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + ldrd r8, r9, [r0, #24] + # Start of loop processing a block +L_SHA256_transform_neon_len_begin: + # Load W + vldm.32 r1!, {d0-d7} + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 + str r1, [sp, #4] + mov lr, #3 + # Start of 16 rounds +L_SHA256_transform_neon_len_start: + # Round 0 + vmov r10, d0[0] + ror r0, r6, #6 + eor r1, r7, r8 + eor r0, r0, r6, ror #11 + and r1, r1, r6 + eor r0, r0, r6, ror #25 + eor r1, r1, r8 + add r9, r9, r0 + add r9, r9, r1 + ldr r0, [r12] + add r9, r9, r10 + add r9, r9, r0 + add r5, r5, r9 + ror r0, r2, #2 + eor r1, r2, r3 + eor r0, r0, r2, ror #13 + eor r10, r3, r4 + and r1, r1, r10 + eor r0, r0, r2, ror #22 + eor r1, r1, r3 + add r9, r9, r0 + add r9, r9, r1 + # Round 1 + vmov r10, d0[1] + # Calc new W[0]-W[1] + vext.8 d10, d0, d1, #4 + ror r0, r5, #6 + vshl.u32 d8, d7, #15 + eor r1, r6, r7 + vsri.u32 d8, d7, #17 + eor r0, r0, r5, ror #11 + vshl.u32 d9, d7, #13 + and r1, r1, r5 + vsri.u32 d9, d7, #19 + eor r0, r0, r5, ror #25 + veor d9, d8 + eor r1, r1, r7 + vshr.u32 d8, d7, #10 + add r8, r8, r0 + veor d9, d8 + add r8, r8, r1 + vadd.i32 d0, d9 + ldr r0, [r12, #4] + vext.8 d11, d4, d5, #4 + add r8, r8, r10 + vadd.i32 d0, d11 + add r8, r8, r0 + vshl.u32 d8, d10, #25 + add r4, r4, r8 + vsri.u32 d8, d10, #7 + ror r0, r9, #2 + vshl.u32 d9, d10, #14 + eor r1, r9, r2 + vsri.u32 d9, d10, #18 + eor r0, r0, r9, ror #13 + veor d9, d8 + eor r10, r2, r3 + 
vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r9, ror #22 + vadd.i32 d0, d9 + eor r1, r1, r2 + add r8, r8, r0 + add r8, r8, r1 + # Round 2 + vmov r10, d1[0] + ror r0, r4, #6 + eor r1, r5, r6 + eor r0, r0, r4, ror #11 + and r1, r1, r4 + eor r0, r0, r4, ror #25 + eor r1, r1, r6 + add r7, r7, r0 + add r7, r7, r1 + ldr r0, [r12, #8] + add r7, r7, r10 + add r7, r7, r0 + add r3, r3, r7 + ror r0, r8, #2 + eor r1, r8, r9 + eor r0, r0, r8, ror #13 + eor r10, r9, r2 + and r1, r1, r10 + eor r0, r0, r8, ror #22 + eor r1, r1, r9 + add r7, r7, r0 + add r7, r7, r1 + # Round 3 + vmov r10, d1[1] + # Calc new W[2]-W[3] + vext.8 d10, d1, d2, #4 + ror r0, r3, #6 + vshl.u32 d8, d0, #15 + eor r1, r4, r5 + vsri.u32 d8, d0, #17 + eor r0, r0, r3, ror #11 + vshl.u32 d9, d0, #13 + and r1, r1, r3 + vsri.u32 d9, d0, #19 + eor r0, r0, r3, ror #25 + veor d9, d8 + eor r1, r1, r5 + vshr.u32 d8, d0, #10 + add r6, r6, r0 + veor d9, d8 + add r6, r6, r1 + vadd.i32 d1, d9 + ldr r0, [r12, #12] + vext.8 d11, d5, d6, #4 + add r6, r6, r10 + vadd.i32 d1, d11 + add r6, r6, r0 + vshl.u32 d8, d10, #25 + add r2, r2, r6 + vsri.u32 d8, d10, #7 + ror r0, r7, #2 + vshl.u32 d9, d10, #14 + eor r1, r7, r8 + vsri.u32 d9, d10, #18 + eor r0, r0, r7, ror #13 + veor d9, d8 + eor r10, r8, r9 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r7, ror #22 + vadd.i32 d1, d9 + eor r1, r1, r8 + add r6, r6, r0 + add r6, r6, r1 + # Round 4 + vmov r10, d2[0] + ror r0, r2, #6 + eor r1, r3, r4 + eor r0, r0, r2, ror #11 + and r1, r1, r2 + eor r0, r0, r2, ror #25 + eor r1, r1, r4 + add r5, r5, r0 + add r5, r5, r1 + ldr r0, [r12, #16] + add r5, r5, r10 + add r5, r5, r0 + add r9, r9, r5 + ror r0, r6, #2 + eor r1, r6, r7 + eor r0, r0, r6, ror #13 + eor r10, r7, r8 + and r1, r1, r10 + eor r0, r0, r6, ror #22 + eor r1, r1, r7 + add r5, r5, r0 + add r5, r5, r1 + # Round 5 + vmov r10, d2[1] + # Calc new W[4]-W[5] + vext.8 d10, d2, d3, #4 + ror r0, r9, #6 + vshl.u32 d8, d1, #15 + eor r1, r2, r3 + vsri.u32 d8, d1, #17 + eor r0, r0, r9, ror #11 + vshl.u32 d9, d1, #13 + and r1, r1, r9 + vsri.u32 d9, d1, #19 + eor r0, r0, r9, ror #25 + veor d9, d8 + eor r1, r1, r3 + vshr.u32 d8, d1, #10 + add r4, r4, r0 + veor d9, d8 + add r4, r4, r1 + vadd.i32 d2, d9 + ldr r0, [r12, #20] + vext.8 d11, d6, d7, #4 + add r4, r4, r10 + vadd.i32 d2, d11 + add r4, r4, r0 + vshl.u32 d8, d10, #25 + add r8, r8, r4 + vsri.u32 d8, d10, #7 + ror r0, r5, #2 + vshl.u32 d9, d10, #14 + eor r1, r5, r6 + vsri.u32 d9, d10, #18 + eor r0, r0, r5, ror #13 + veor d9, d8 + eor r10, r6, r7 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r5, ror #22 + vadd.i32 d2, d9 + eor r1, r1, r6 + add r4, r4, r0 + add r4, r4, r1 + # Round 6 + vmov r10, d3[0] + ror r0, r8, #6 + eor r1, r9, r2 + eor r0, r0, r8, ror #11 + and r1, r1, r8 + eor r0, r0, r8, ror #25 + eor r1, r1, r2 + add r3, r3, r0 + add r3, r3, r1 + ldr r0, [r12, #24] + add r3, r3, r10 + add r3, r3, r0 + add r7, r7, r3 + ror r0, r4, #2 + eor r1, r4, r5 + eor r0, r0, r4, ror #13 + eor r10, r5, r6 + and r1, r1, r10 + eor r0, r0, r4, ror #22 + eor r1, r1, r5 + add r3, r3, r0 + add r3, r3, r1 + # Round 7 + vmov r10, d3[1] + # Calc new W[6]-W[7] + vext.8 d10, d3, d4, #4 + ror r0, r7, #6 + vshl.u32 d8, d2, #15 + eor r1, r8, r9 + vsri.u32 d8, d2, #17 + eor r0, r0, r7, ror #11 + vshl.u32 d9, d2, #13 + and r1, r1, r7 + vsri.u32 d9, d2, #19 + eor r0, r0, r7, ror #25 + veor d9, d8 + eor r1, r1, r9 + vshr.u32 d8, d2, #10 + add r2, r2, r0 + veor d9, d8 + add r2, r2, r1 + vadd.i32 d3, d9 + ldr r0, [r12, #28] + vext.8 d11, d7, 
d0, #4 + add r2, r2, r10 + vadd.i32 d3, d11 + add r2, r2, r0 + vshl.u32 d8, d10, #25 + add r6, r6, r2 + vsri.u32 d8, d10, #7 + ror r0, r3, #2 + vshl.u32 d9, d10, #14 + eor r1, r3, r4 + vsri.u32 d9, d10, #18 + eor r0, r0, r3, ror #13 + veor d9, d8 + eor r10, r4, r5 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r3, ror #22 + vadd.i32 d3, d9 + eor r1, r1, r4 + add r2, r2, r0 + add r2, r2, r1 + # Round 8 + vmov r10, d4[0] + ror r0, r6, #6 + eor r1, r7, r8 + eor r0, r0, r6, ror #11 + and r1, r1, r6 + eor r0, r0, r6, ror #25 + eor r1, r1, r8 + add r9, r9, r0 + add r9, r9, r1 + ldr r0, [r12, #32] + add r9, r9, r10 + add r9, r9, r0 + add r5, r5, r9 + ror r0, r2, #2 + eor r1, r2, r3 + eor r0, r0, r2, ror #13 + eor r10, r3, r4 + and r1, r1, r10 + eor r0, r0, r2, ror #22 + eor r1, r1, r3 + add r9, r9, r0 + add r9, r9, r1 + # Round 9 + vmov r10, d4[1] + # Calc new W[8]-W[9] + vext.8 d10, d4, d5, #4 + ror r0, r5, #6 + vshl.u32 d8, d3, #15 + eor r1, r6, r7 + vsri.u32 d8, d3, #17 + eor r0, r0, r5, ror #11 + vshl.u32 d9, d3, #13 + and r1, r1, r5 + vsri.u32 d9, d3, #19 + eor r0, r0, r5, ror #25 + veor d9, d8 + eor r1, r1, r7 + vshr.u32 d8, d3, #10 + add r8, r8, r0 + veor d9, d8 + add r8, r8, r1 + vadd.i32 d4, d9 + ldr r0, [r12, #36] + vext.8 d11, d0, d1, #4 + add r8, r8, r10 + vadd.i32 d4, d11 + add r8, r8, r0 + vshl.u32 d8, d10, #25 + add r4, r4, r8 + vsri.u32 d8, d10, #7 + ror r0, r9, #2 + vshl.u32 d9, d10, #14 + eor r1, r9, r2 + vsri.u32 d9, d10, #18 + eor r0, r0, r9, ror #13 + veor d9, d8 + eor r10, r2, r3 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r9, ror #22 + vadd.i32 d4, d9 + eor r1, r1, r2 + add r8, r8, r0 + add r8, r8, r1 + # Round 10 + vmov r10, d5[0] + ror r0, r4, #6 + eor r1, r5, r6 + eor r0, r0, r4, ror #11 + and r1, r1, r4 + eor r0, r0, r4, ror #25 + eor r1, r1, r6 + add r7, r7, r0 + add r7, r7, r1 + ldr r0, [r12, #40] + add r7, r7, r10 + add r7, r7, r0 + add r3, r3, r7 + ror r0, r8, #2 + eor r1, r8, r9 + eor r0, r0, r8, ror #13 + eor r10, r9, r2 + and r1, r1, r10 + eor r0, r0, r8, ror #22 + eor r1, r1, r9 + add r7, r7, r0 + add r7, r7, r1 + # Round 11 + vmov r10, d5[1] + # Calc new W[10]-W[11] + vext.8 d10, d5, d6, #4 + ror r0, r3, #6 + vshl.u32 d8, d4, #15 + eor r1, r4, r5 + vsri.u32 d8, d4, #17 + eor r0, r0, r3, ror #11 + vshl.u32 d9, d4, #13 + and r1, r1, r3 + vsri.u32 d9, d4, #19 + eor r0, r0, r3, ror #25 + veor d9, d8 + eor r1, r1, r5 + vshr.u32 d8, d4, #10 + add r6, r6, r0 + veor d9, d8 + add r6, r6, r1 + vadd.i32 d5, d9 + ldr r0, [r12, #44] + vext.8 d11, d1, d2, #4 + add r6, r6, r10 + vadd.i32 d5, d11 + add r6, r6, r0 + vshl.u32 d8, d10, #25 + add r2, r2, r6 + vsri.u32 d8, d10, #7 + ror r0, r7, #2 + vshl.u32 d9, d10, #14 + eor r1, r7, r8 + vsri.u32 d9, d10, #18 + eor r0, r0, r7, ror #13 + veor d9, d8 + eor r10, r8, r9 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r7, ror #22 + vadd.i32 d5, d9 + eor r1, r1, r8 + add r6, r6, r0 + add r6, r6, r1 + # Round 12 + vmov r10, d6[0] + ror r0, r2, #6 + eor r1, r3, r4 + eor r0, r0, r2, ror #11 + and r1, r1, r2 + eor r0, r0, r2, ror #25 + eor r1, r1, r4 + add r5, r5, r0 + add r5, r5, r1 + ldr r0, [r12, #48] + add r5, r5, r10 + add r5, r5, r0 + add r9, r9, r5 + ror r0, r6, #2 + eor r1, r6, r7 + eor r0, r0, r6, ror #13 + eor r10, r7, r8 + and r1, r1, r10 + eor r0, r0, r6, ror #22 + eor r1, r1, r7 + add r5, r5, r0 + add r5, r5, r1 + # Round 13 + vmov r10, d6[1] + # Calc new W[12]-W[13] + vext.8 d10, d6, d7, #4 + ror r0, r9, #6 + vshl.u32 d8, d5, #15 + eor r1, r2, r3 + vsri.u32 d8, d5, 
#17 + eor r0, r0, r9, ror #11 + vshl.u32 d9, d5, #13 + and r1, r1, r9 + vsri.u32 d9, d5, #19 + eor r0, r0, r9, ror #25 + veor d9, d8 + eor r1, r1, r3 + vshr.u32 d8, d5, #10 + add r4, r4, r0 + veor d9, d8 + add r4, r4, r1 + vadd.i32 d6, d9 + ldr r0, [r12, #52] + vext.8 d11, d2, d3, #4 + add r4, r4, r10 + vadd.i32 d6, d11 + add r4, r4, r0 + vshl.u32 d8, d10, #25 + add r8, r8, r4 + vsri.u32 d8, d10, #7 + ror r0, r5, #2 + vshl.u32 d9, d10, #14 + eor r1, r5, r6 + vsri.u32 d9, d10, #18 + eor r0, r0, r5, ror #13 + veor d9, d8 + eor r10, r6, r7 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r5, ror #22 + vadd.i32 d6, d9 + eor r1, r1, r6 + add r4, r4, r0 + add r4, r4, r1 + # Round 14 + vmov r10, d7[0] + ror r0, r8, #6 + eor r1, r9, r2 + eor r0, r0, r8, ror #11 + and r1, r1, r8 + eor r0, r0, r8, ror #25 + eor r1, r1, r2 + add r3, r3, r0 + add r3, r3, r1 + ldr r0, [r12, #56] + add r3, r3, r10 + add r3, r3, r0 + add r7, r7, r3 + ror r0, r4, #2 + eor r1, r4, r5 + eor r0, r0, r4, ror #13 + eor r10, r5, r6 + and r1, r1, r10 + eor r0, r0, r4, ror #22 + eor r1, r1, r5 + add r3, r3, r0 + add r3, r3, r1 + # Round 15 + vmov r10, d7[1] + # Calc new W[14]-W[15] + vext.8 d10, d7, d0, #4 + ror r0, r7, #6 + vshl.u32 d8, d6, #15 + eor r1, r8, r9 + vsri.u32 d8, d6, #17 + eor r0, r0, r7, ror #11 + vshl.u32 d9, d6, #13 + and r1, r1, r7 + vsri.u32 d9, d6, #19 + eor r0, r0, r7, ror #25 + veor d9, d8 + eor r1, r1, r9 + vshr.u32 d8, d6, #10 + add r2, r2, r0 + veor d9, d8 + add r2, r2, r1 + vadd.i32 d7, d9 + ldr r0, [r12, #60] + vext.8 d11, d3, d4, #4 + add r2, r2, r10 + vadd.i32 d7, d11 + add r2, r2, r0 + vshl.u32 d8, d10, #25 + add r6, r6, r2 + vsri.u32 d8, d10, #7 + ror r0, r3, #2 + vshl.u32 d9, d10, #14 + eor r1, r3, r4 + vsri.u32 d9, d10, #18 + eor r0, r0, r3, ror #13 + veor d9, d8 + eor r10, r4, r5 + vshr.u32 d10, #3 + and r1, r1, r10 + veor d9, d10 + eor r0, r0, r3, ror #22 + vadd.i32 d7, d9 + eor r1, r1, r4 + add r2, r2, r0 + add r2, r2, r1 + add r12, r12, #0x40 + subs lr, lr, #1 + bne L_SHA256_transform_neon_len_start + # Round 0 + vmov r10, d0[0] + ror r0, r6, #6 + eor r1, r7, r8 + eor r0, r0, r6, ror #11 + and r1, r1, r6 + eor r0, r0, r6, ror #25 + eor r1, r1, r8 + add r9, r9, r0 + add r9, r9, r1 + ldr r0, [r12] + add r9, r9, r10 + add r9, r9, r0 + add r5, r5, r9 + ror r0, r2, #2 + eor r1, r2, r3 + eor r0, r0, r2, ror #13 + eor r10, r3, r4 + and r1, r1, r10 + eor r0, r0, r2, ror #22 + eor r1, r1, r3 + add r9, r9, r0 + add r9, r9, r1 + # Round 1 + vmov r10, d0[1] + ror r0, r5, #6 + eor r1, r6, r7 + eor r0, r0, r5, ror #11 + and r1, r1, r5 + eor r0, r0, r5, ror #25 + eor r1, r1, r7 + add r8, r8, r0 + add r8, r8, r1 + ldr r0, [r12, #4] + add r8, r8, r10 + add r8, r8, r0 + add r4, r4, r8 + ror r0, r9, #2 + eor r1, r9, r2 + eor r0, r0, r9, ror #13 + eor r10, r2, r3 + and r1, r1, r10 + eor r0, r0, r9, ror #22 + eor r1, r1, r2 + add r8, r8, r0 + add r8, r8, r1 + # Round 2 + vmov r10, d1[0] + ror r0, r4, #6 + eor r1, r5, r6 + eor r0, r0, r4, ror #11 + and r1, r1, r4 + eor r0, r0, r4, ror #25 + eor r1, r1, r6 + add r7, r7, r0 + add r7, r7, r1 + ldr r0, [r12, #8] + add r7, r7, r10 + add r7, r7, r0 + add r3, r3, r7 + ror r0, r8, #2 + eor r1, r8, r9 + eor r0, r0, r8, ror #13 + eor r10, r9, r2 + and r1, r1, r10 + eor r0, r0, r8, ror #22 + eor r1, r1, r9 + add r7, r7, r0 + add r7, r7, r1 + # Round 3 + vmov r10, d1[1] + ror r0, r3, #6 + eor r1, r4, r5 + eor r0, r0, r3, ror #11 + and r1, r1, r3 + eor r0, r0, r3, ror #25 + eor r1, r1, r5 + add r6, r6, r0 + add r6, r6, r1 + ldr r0, [r12, #12] + add r6, r6, r10 
+ add r6, r6, r0 + add r2, r2, r6 + ror r0, r7, #2 + eor r1, r7, r8 + eor r0, r0, r7, ror #13 + eor r10, r8, r9 + and r1, r1, r10 + eor r0, r0, r7, ror #22 + eor r1, r1, r8 + add r6, r6, r0 + add r6, r6, r1 + # Round 4 + vmov r10, d2[0] + ror r0, r2, #6 + eor r1, r3, r4 + eor r0, r0, r2, ror #11 + and r1, r1, r2 + eor r0, r0, r2, ror #25 + eor r1, r1, r4 + add r5, r5, r0 + add r5, r5, r1 + ldr r0, [r12, #16] + add r5, r5, r10 + add r5, r5, r0 + add r9, r9, r5 + ror r0, r6, #2 + eor r1, r6, r7 + eor r0, r0, r6, ror #13 + eor r10, r7, r8 + and r1, r1, r10 + eor r0, r0, r6, ror #22 + eor r1, r1, r7 + add r5, r5, r0 + add r5, r5, r1 + # Round 5 + vmov r10, d2[1] + ror r0, r9, #6 + eor r1, r2, r3 + eor r0, r0, r9, ror #11 + and r1, r1, r9 + eor r0, r0, r9, ror #25 + eor r1, r1, r3 + add r4, r4, r0 + add r4, r4, r1 + ldr r0, [r12, #20] + add r4, r4, r10 + add r4, r4, r0 + add r8, r8, r4 + ror r0, r5, #2 + eor r1, r5, r6 + eor r0, r0, r5, ror #13 + eor r10, r6, r7 + and r1, r1, r10 + eor r0, r0, r5, ror #22 + eor r1, r1, r6 + add r4, r4, r0 + add r4, r4, r1 + # Round 6 + vmov r10, d3[0] + ror r0, r8, #6 + eor r1, r9, r2 + eor r0, r0, r8, ror #11 + and r1, r1, r8 + eor r0, r0, r8, ror #25 + eor r1, r1, r2 + add r3, r3, r0 + add r3, r3, r1 + ldr r0, [r12, #24] + add r3, r3, r10 + add r3, r3, r0 + add r7, r7, r3 + ror r0, r4, #2 + eor r1, r4, r5 + eor r0, r0, r4, ror #13 + eor r10, r5, r6 + and r1, r1, r10 + eor r0, r0, r4, ror #22 + eor r1, r1, r5 + add r3, r3, r0 + add r3, r3, r1 + # Round 7 + vmov r10, d3[1] + ror r0, r7, #6 + eor r1, r8, r9 + eor r0, r0, r7, ror #11 + and r1, r1, r7 + eor r0, r0, r7, ror #25 + eor r1, r1, r9 + add r2, r2, r0 + add r2, r2, r1 + ldr r0, [r12, #28] + add r2, r2, r10 + add r2, r2, r0 + add r6, r6, r2 + ror r0, r3, #2 + eor r1, r3, r4 + eor r0, r0, r3, ror #13 + eor r10, r4, r5 + and r1, r1, r10 + eor r0, r0, r3, ror #22 + eor r1, r1, r4 + add r2, r2, r0 + add r2, r2, r1 + # Round 8 + vmov r10, d4[0] + ror r0, r6, #6 + eor r1, r7, r8 + eor r0, r0, r6, ror #11 + and r1, r1, r6 + eor r0, r0, r6, ror #25 + eor r1, r1, r8 + add r9, r9, r0 + add r9, r9, r1 + ldr r0, [r12, #32] + add r9, r9, r10 + add r9, r9, r0 + add r5, r5, r9 + ror r0, r2, #2 + eor r1, r2, r3 + eor r0, r0, r2, ror #13 + eor r10, r3, r4 + and r1, r1, r10 + eor r0, r0, r2, ror #22 + eor r1, r1, r3 + add r9, r9, r0 + add r9, r9, r1 + # Round 9 + vmov r10, d4[1] + ror r0, r5, #6 + eor r1, r6, r7 + eor r0, r0, r5, ror #11 + and r1, r1, r5 + eor r0, r0, r5, ror #25 + eor r1, r1, r7 + add r8, r8, r0 + add r8, r8, r1 + ldr r0, [r12, #36] + add r8, r8, r10 + add r8, r8, r0 + add r4, r4, r8 + ror r0, r9, #2 + eor r1, r9, r2 + eor r0, r0, r9, ror #13 + eor r10, r2, r3 + and r1, r1, r10 + eor r0, r0, r9, ror #22 + eor r1, r1, r2 + add r8, r8, r0 + add r8, r8, r1 + # Round 10 + vmov r10, d5[0] + ror r0, r4, #6 + eor r1, r5, r6 + eor r0, r0, r4, ror #11 + and r1, r1, r4 + eor r0, r0, r4, ror #25 + eor r1, r1, r6 + add r7, r7, r0 + add r7, r7, r1 + ldr r0, [r12, #40] + add r7, r7, r10 + add r7, r7, r0 + add r3, r3, r7 + ror r0, r8, #2 + eor r1, r8, r9 + eor r0, r0, r8, ror #13 + eor r10, r9, r2 + and r1, r1, r10 + eor r0, r0, r8, ror #22 + eor r1, r1, r9 + add r7, r7, r0 + add r7, r7, r1 + # Round 11 + vmov r10, d5[1] + ror r0, r3, #6 + eor r1, r4, r5 + eor r0, r0, r3, ror #11 + and r1, r1, r3 + eor r0, r0, r3, ror #25 + eor r1, r1, r5 + add r6, r6, r0 + add r6, r6, r1 + ldr r0, [r12, #44] + add r6, r6, r10 + add r6, r6, r0 + add r2, r2, r6 + ror r0, r7, #2 + eor r1, r7, r8 + eor r0, r0, r7, ror #13 + eor r10, r8, r9 
+ and r1, r1, r10 + eor r0, r0, r7, ror #22 + eor r1, r1, r8 + add r6, r6, r0 + add r6, r6, r1 + # Round 12 + vmov r10, d6[0] + ror r0, r2, #6 + eor r1, r3, r4 + eor r0, r0, r2, ror #11 + and r1, r1, r2 + eor r0, r0, r2, ror #25 + eor r1, r1, r4 + add r5, r5, r0 + add r5, r5, r1 + ldr r0, [r12, #48] + add r5, r5, r10 + add r5, r5, r0 + add r9, r9, r5 + ror r0, r6, #2 + eor r1, r6, r7 + eor r0, r0, r6, ror #13 + eor r10, r7, r8 + and r1, r1, r10 + eor r0, r0, r6, ror #22 + eor r1, r1, r7 + add r5, r5, r0 + add r5, r5, r1 + # Round 13 + vmov r10, d6[1] + ror r0, r9, #6 + eor r1, r2, r3 + eor r0, r0, r9, ror #11 + and r1, r1, r9 + eor r0, r0, r9, ror #25 + eor r1, r1, r3 + add r4, r4, r0 + add r4, r4, r1 + ldr r0, [r12, #52] + add r4, r4, r10 + add r4, r4, r0 + add r8, r8, r4 + ror r0, r5, #2 + eor r1, r5, r6 + eor r0, r0, r5, ror #13 + eor r10, r6, r7 + and r1, r1, r10 + eor r0, r0, r5, ror #22 + eor r1, r1, r6 + add r4, r4, r0 + add r4, r4, r1 + # Round 14 + vmov r10, d7[0] + ror r0, r8, #6 + eor r1, r9, r2 + eor r0, r0, r8, ror #11 + and r1, r1, r8 + eor r0, r0, r8, ror #25 + eor r1, r1, r2 + add r3, r3, r0 + add r3, r3, r1 + ldr r0, [r12, #56] + add r3, r3, r10 + add r3, r3, r0 + add r7, r7, r3 + ror r0, r4, #2 + eor r1, r4, r5 + eor r0, r0, r4, ror #13 + eor r10, r5, r6 + and r1, r1, r10 + eor r0, r0, r4, ror #22 + eor r1, r1, r5 + add r3, r3, r0 + add r3, r3, r1 + # Round 15 + vmov r10, d7[1] + ror r0, r7, #6 + eor r1, r8, r9 + eor r0, r0, r7, ror #11 + and r1, r1, r7 + eor r0, r0, r7, ror #25 + eor r1, r1, r9 + add r2, r2, r0 + add r2, r2, r1 + ldr r0, [r12, #60] + add r2, r2, r10 + add r2, r2, r0 + add r6, r6, r2 + ror r0, r3, #2 + eor r1, r3, r4 + eor r0, r0, r3, ror #13 + eor r10, r4, r5 + and r1, r1, r10 + eor r0, r0, r3, ror #22 + eor r1, r1, r4 + add r2, r2, r0 + add r2, r2, r1 + ldr r10, [sp] + # Add in digest from start + ldrd r0, r1, [r10] + add r2, r2, r0 + add r3, r3, r1 + strd r2, r3, [r10] + ldrd r0, r1, [r10, #8] + add r4, r4, r0 + add r5, r5, r1 + strd r4, r5, [r10, #8] + ldrd r0, r1, [r10, #16] + add r6, r6, r0 + add r7, r7, r1 + strd r6, r7, [r10, #16] + ldrd r0, r1, [r10, #24] + add r8, r8, r0 + add r9, r9, r1 + strd r8, r9, [r10, #24] + ldr r10, [sp, #8] + ldr r1, [sp, #4] + subs r10, r10, #0x40 + sub r12, r12, #0xc0 + str r10, [sp, #8] + bne L_SHA256_transform_neon_len_begin + add sp, sp, #24 + vpop {d8-d11} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size Transform_Sha256_Len,.-Transform_Sha256_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* !__aarch64__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c new file mode 100644 index 000000000..73a53027c --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -0,0 +1,2499 @@ +/* armv8-32-sha256-asm + * + * Copyright (C) 2006-2021 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha256-asm.c + */ + +#include + +#ifdef WOLFSSL_ARMASM +#ifndef __aarch64__ +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#ifndef NO_SHA256 +#include + +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint32_t L_SHA256_transform_len_k[] = { + 0x428a2f98, + 0x71374491, + 0xb5c0fbcf, + 0xe9b5dba5, + 0x3956c25b, + 0x59f111f1, + 0x923f82a4, + 0xab1c5ed5, + 0xd807aa98, + 0x12835b01, + 0x243185be, + 0x550c7dc3, + 0x72be5d74, + 0x80deb1fe, + 0x9bdc06a7, + 0xc19bf174, + 0xe49b69c1, + 0xefbe4786, + 0xfc19dc6, + 0x240ca1cc, + 0x2de92c6f, + 0x4a7484aa, + 0x5cb0a9dc, + 0x76f988da, + 0x983e5152, + 0xa831c66d, + 0xb00327c8, + 0xbf597fc7, + 0xc6e00bf3, + 0xd5a79147, + 0x6ca6351, + 0x14292967, + 0x27b70a85, + 0x2e1b2138, + 0x4d2c6dfc, + 0x53380d13, + 0x650a7354, + 0x766a0abb, + 0x81c2c92e, + 0x92722c85, + 0xa2bfe8a1, + 0xa81a664b, + 0xc24b8b70, + 0xc76c51a3, + 0xd192e819, + 0xd6990624, + 0xf40e3585, + 0x106aa070, + 0x19a4c116, + 0x1e376c08, + 0x2748774c, + 0x34b0bcb5, + 0x391c0cb3, + 0x4ed8aa4a, + 0x5b9cca4f, + 0x682e6ff3, + 0x748f82ee, + 0x78a5636f, + 0x84c87814, + 0x8cc70208, + 0x90befffa, + 0xa4506ceb, + 0xbef9a3f7, + 0xc67178f2, +}; + +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xc0\n\t" + "mov r3, %[L_SHA256_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "ldrd r12, lr, [%[sha256]]\n\t" + "ldrd r4, r5, [%[sha256], #8]\n\t" + "ldrd r6, r7, [%[sha256], #16]\n\t" + "ldrd r8, r9, [%[sha256], #24]\n\t" + "strd r12, lr, [sp, #64]\n\t" + "strd r4, r5, [sp, #72]\n\t" + "strd r6, r7, [sp, #80]\n\t" + "strd r8, r9, [sp, #88]\n\t" + /* Start of loop processing a block */ + "\n" + "L_SHA256_transform_len_begin_%=: \n\t" + /* Load, Reverse and Store W - 64 bytes */ + "ldrd r12, lr, [%[data]]\n\t" + "ldrd r4, r5, [%[data], #8]\n\t" + "ldrd r6, r7, [%[data], #16]\n\t" + "ldrd r8, r9, [%[data], #24]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "strd r12, lr, [sp]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "strd r6, r7, [sp, #16]\n\t" + "strd r8, r9, [sp, #24]\n\t" + "ldrd r12, lr, [%[data], #32]\n\t" + "ldrd r4, r5, [%[data], #40]\n\t" + "ldrd r6, r7, [%[data], #48]\n\t" + "ldrd r8, r9, [%[data], #56]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "strd r12, lr, [sp, #32]\n\t" + "strd r4, r5, [sp, #40]\n\t" + "strd r6, r7, [sp, #48]\n\t" + "strd r8, r9, [sp, #56]\n\t" + "ldr r9, [%[sha256], #4]\n\t" + "ldr r12, [%[sha256], #8]\n\t" + "eor r9, r9, r12\n\t" + "mov r10, #3\n\t" + /* Start of 16 rounds */ + "\n" + "L_SHA256_transform_len_start_%=: \n\t" + /* Round 0 */ + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r7, [%[sha256], #28]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + 
"add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp]\n\t" + "ldr r4, [r3]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r6, [%[sha256], #12]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #12]\n\t" + "str r7, [%[sha256], #28]\n\t" + /* Calc new W[0] */ + "ldr r4, [sp, #56]\n\t" + "ldr r5, [sp, #36]\n\t" + "ldr r6, [sp, #4]\n\t" + "ldr r7, [sp]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp]\n\t" + /* Round 1 */ + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r7, [%[sha256], #24]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #4]\n\t" + "ldr r4, [r3, #4]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r6, [%[sha256], #8]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #8]\n\t" + "str r7, [%[sha256], #24]\n\t" + /* Calc new W[1] */ + "ldr r4, [sp, #60]\n\t" + "ldr r5, [sp, #40]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r7, [sp, #4]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #4]\n\t" + /* Round 2 */ + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r7, [%[sha256], #20]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #8]\n\t" + "ldr r4, [r3, #8]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r6, [%[sha256], #4]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #4]\n\t" + "str r7, [%[sha256], #20]\n\t" + /* Calc new W[2] */ + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #44]\n\t" + "ldr r6, [sp, #12]\n\t" + "ldr r7, [sp, #8]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #8]\n\t" + /* Round 3 */ + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr 
r5, [%[sha256], #12]\n\t" + "ldr r7, [%[sha256], #16]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #12]\n\t" + "ldr r4, [r3, #12]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r6, [%[sha256]]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256]]\n\t" + "str r7, [%[sha256], #16]\n\t" + /* Calc new W[3] */ + "ldr r4, [sp, #4]\n\t" + "ldr r5, [sp, #48]\n\t" + "ldr r6, [sp, #16]\n\t" + "ldr r7, [sp, #12]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #12]\n\t" + /* Round 4 */ + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r7, [%[sha256], #12]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #16]\n\t" + "ldr r4, [r3, #16]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r6, [%[sha256], #28]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #28]\n\t" + "str r7, [%[sha256], #12]\n\t" + /* Calc new W[4] */ + "ldr r4, [sp, #8]\n\t" + "ldr r5, [sp, #52]\n\t" + "ldr r6, [sp, #20]\n\t" + "ldr r7, [sp, #16]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #16]\n\t" + /* Round 5 */ + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r7, [%[sha256], #8]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #20]\n\t" + "ldr r4, [r3, #20]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r6, [%[sha256], #24]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #24]\n\t" + "str r7, [%[sha256], #8]\n\t" + /* Calc new W[5] */ + "ldr r4, [sp, #12]\n\t" + "ldr r5, [sp, #56]\n\t" + "ldr r6, [sp, #24]\n\t" + "ldr r7, [sp, #20]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 
18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #20]\n\t" + /* Round 6 */ + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r7, [%[sha256], #4]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #24]\n\t" + "ldr r4, [r3, #24]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r6, [%[sha256], #20]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #20]\n\t" + "str r7, [%[sha256], #4]\n\t" + /* Calc new W[6] */ + "ldr r4, [sp, #16]\n\t" + "ldr r5, [sp, #60]\n\t" + "ldr r6, [sp, #28]\n\t" + "ldr r7, [sp, #24]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #24]\n\t" + /* Round 7 */ + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r7, [%[sha256]]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #28]\n\t" + "ldr r4, [r3, #28]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r6, [%[sha256], #16]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #16]\n\t" + "str r7, [%[sha256]]\n\t" + /* Calc new W[7] */ + "ldr r4, [sp, #20]\n\t" + "ldr r5, [sp]\n\t" + "ldr r6, [sp, #32]\n\t" + "ldr r7, [sp, #28]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #28]\n\t" + /* Round 8 */ + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r7, [%[sha256], #28]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #32]\n\t" + "ldr r4, [r3, #32]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r6, [%[sha256], #12]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #12]\n\t" + "str r7, 
[%[sha256], #28]\n\t" + /* Calc new W[8] */ + "ldr r4, [sp, #24]\n\t" + "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #36]\n\t" + "ldr r7, [sp, #32]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #32]\n\t" + /* Round 9 */ + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r7, [%[sha256], #24]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #36]\n\t" + "ldr r4, [r3, #36]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r6, [%[sha256], #8]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #8]\n\t" + "str r7, [%[sha256], #24]\n\t" + /* Calc new W[9] */ + "ldr r4, [sp, #28]\n\t" + "ldr r5, [sp, #8]\n\t" + "ldr r6, [sp, #40]\n\t" + "ldr r7, [sp, #36]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #36]\n\t" + /* Round 10 */ + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r7, [%[sha256], #20]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #40]\n\t" + "ldr r4, [r3, #40]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r6, [%[sha256], #4]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #4]\n\t" + "str r7, [%[sha256], #20]\n\t" + /* Calc new W[10] */ + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #12]\n\t" + "ldr r6, [sp, #44]\n\t" + "ldr r7, [sp, #40]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #40]\n\t" + /* Round 11 */ + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r7, [%[sha256], #16]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #44]\n\t" + "ldr r4, [r3, #44]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r6, [%[sha256]]\n\t" + "ror r12, lr, 
#2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256]]\n\t" + "str r7, [%[sha256], #16]\n\t" + /* Calc new W[11] */ + "ldr r4, [sp, #36]\n\t" + "ldr r5, [sp, #16]\n\t" + "ldr r6, [sp, #48]\n\t" + "ldr r7, [sp, #44]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #44]\n\t" + /* Round 12 */ + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r7, [%[sha256], #12]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #48]\n\t" + "ldr r4, [r3, #48]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r6, [%[sha256], #28]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #28]\n\t" + "str r7, [%[sha256], #12]\n\t" + /* Calc new W[12] */ + "ldr r4, [sp, #40]\n\t" + "ldr r5, [sp, #20]\n\t" + "ldr r6, [sp, #52]\n\t" + "ldr r7, [sp, #48]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #48]\n\t" + /* Round 13 */ + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r7, [%[sha256], #8]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #52]\n\t" + "ldr r4, [r3, #52]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r6, [%[sha256], #24]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #24]\n\t" + "str r7, [%[sha256], #8]\n\t" + /* Calc new W[13] */ + "ldr r4, [sp, #44]\n\t" + "ldr r5, [sp, #24]\n\t" + "ldr r6, [sp, #56]\n\t" + "ldr r7, [sp, #52]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #52]\n\t" + /* Round 14 */ + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r7, [%[sha256], #4]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add 
r7, r7, r4\n\t" + "ldr lr, [sp, #56]\n\t" + "ldr r4, [r3, #56]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r6, [%[sha256], #20]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #20]\n\t" + "str r7, [%[sha256], #4]\n\t" + /* Calc new W[14] */ + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #28]\n\t" + "ldr r6, [sp, #60]\n\t" + "ldr r7, [sp, #56]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #56]\n\t" + /* Round 15 */ + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r7, [%[sha256]]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #60]\n\t" + "ldr r4, [r3, #60]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r6, [%[sha256], #16]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #16]\n\t" + "str r7, [%[sha256]]\n\t" + /* Calc new W[15] */ + "ldr r4, [sp, #52]\n\t" + "ldr r5, [sp, #32]\n\t" + "ldr r6, [sp]\n\t" + "ldr r7, [sp, #60]\n\t" + "ror r12, r4, #17\n\t" + "ror lr, r6, #7\n\t" + "eor r12, r12, r4, ror 19\n\t" + "eor lr, lr, r6, ror 18\n\t" + "eor r12, r12, r4, lsr 10\n\t" + "eor lr, lr, r6, lsr 3\n\t" + "add r7, r7, r5\n\t" + "add r12, r12, lr\n\t" + "add r7, r7, r12\n\t" + "str r7, [sp, #60]\n\t" + "add r3, r3, #0x40\n\t" + "subs r10, r10, #1\n\t" + "bne L_SHA256_transform_len_start_%=\n\t" + /* Round 0 */ + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r7, [%[sha256], #28]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp]\n\t" + "ldr r4, [r3]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r6, [%[sha256], #12]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #12]\n\t" + "str r7, [%[sha256], #28]\n\t" + /* Round 1 */ + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r7, [%[sha256], #24]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #4]\n\t" + "ldr r4, [r3, 
#4]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r6, [%[sha256], #8]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #8]\n\t" + "str r7, [%[sha256], #24]\n\t" + /* Round 2 */ + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r7, [%[sha256], #20]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #8]\n\t" + "ldr r4, [r3, #8]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r6, [%[sha256], #4]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #4]\n\t" + "str r7, [%[sha256], #20]\n\t" + /* Round 3 */ + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r7, [%[sha256], #16]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #12]\n\t" + "ldr r4, [r3, #12]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r6, [%[sha256]]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256]]\n\t" + "str r7, [%[sha256], #16]\n\t" + /* Round 4 */ + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r7, [%[sha256], #12]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #16]\n\t" + "ldr r4, [r3, #16]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r6, [%[sha256], #28]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #28]\n\t" + "str r7, [%[sha256], #12]\n\t" + /* Round 5 */ + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r7, [%[sha256], #8]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #20]\n\t" + "ldr r4, [r3, #20]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], 
#12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r6, [%[sha256], #24]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #24]\n\t" + "str r7, [%[sha256], #8]\n\t" + /* Round 6 */ + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r7, [%[sha256], #4]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #24]\n\t" + "ldr r4, [r3, #24]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r6, [%[sha256], #20]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #20]\n\t" + "str r7, [%[sha256], #4]\n\t" + /* Round 7 */ + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r7, [%[sha256]]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #28]\n\t" + "ldr r4, [r3, #28]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r6, [%[sha256], #16]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #16]\n\t" + "str r7, [%[sha256]]\n\t" + /* Round 8 */ + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r7, [%[sha256], #28]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #32]\n\t" + "ldr r4, [r3, #32]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r6, [%[sha256], #12]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #12]\n\t" + "str r7, [%[sha256], #28]\n\t" + /* Round 9 */ + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r7, [%[sha256], #24]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #36]\n\t" + "ldr r4, [r3, #36]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], 
#4]\n\t" + "ldr r6, [%[sha256], #8]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #8]\n\t" + "str r7, [%[sha256], #24]\n\t" + /* Round 10 */ + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r7, [%[sha256], #20]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #40]\n\t" + "ldr r4, [r3, #40]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r6, [%[sha256], #4]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #4]\n\t" + "str r7, [%[sha256], #20]\n\t" + /* Round 11 */ + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r7, [%[sha256], #16]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #44]\n\t" + "ldr r4, [r3, #44]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r6, [%[sha256]]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256]]\n\t" + "str r7, [%[sha256], #16]\n\t" + /* Round 12 */ + "ldr lr, [%[sha256]]\n\t" + "ldr r4, [%[sha256], #4]\n\t" + "ldr r5, [%[sha256], #8]\n\t" + "ldr r7, [%[sha256], #12]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #48]\n\t" + "ldr r4, [r3, #48]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #16]\n\t" + "ldr r4, [%[sha256], #20]\n\t" + "ldr r5, [%[sha256], #24]\n\t" + "ldr r6, [%[sha256], #28]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #28]\n\t" + "str r7, [%[sha256], #12]\n\t" + /* Round 13 */ + "ldr lr, [%[sha256], #28]\n\t" + "ldr r4, [%[sha256]]\n\t" + "ldr r5, [%[sha256], #4]\n\t" + "ldr r7, [%[sha256], #8]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #52]\n\t" + "ldr r4, [r3, #52]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #12]\n\t" + "ldr r4, [%[sha256], #16]\n\t" + "ldr r5, [%[sha256], #20]\n\t" + "ldr r6, [%[sha256], #24]\n\t" + "ror r12, lr, #2\n\t" + 
"eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #24]\n\t" + "str r7, [%[sha256], #8]\n\t" + /* Round 14 */ + "ldr lr, [%[sha256], #24]\n\t" + "ldr r4, [%[sha256], #28]\n\t" + "ldr r5, [%[sha256]]\n\t" + "ldr r7, [%[sha256], #4]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #56]\n\t" + "ldr r4, [r3, #56]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #8]\n\t" + "ldr r4, [%[sha256], #12]\n\t" + "ldr r5, [%[sha256], #16]\n\t" + "ldr r6, [%[sha256], #20]\n\t" + "ror r12, lr, #2\n\t" + "eor r8, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r9, r9, r8\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r9, r9, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r9\n\t" + "str r6, [%[sha256], #20]\n\t" + "str r7, [%[sha256], #4]\n\t" + /* Round 15 */ + "ldr lr, [%[sha256], #20]\n\t" + "ldr r4, [%[sha256], #24]\n\t" + "ldr r5, [%[sha256], #28]\n\t" + "ldr r7, [%[sha256]]\n\t" + "ror r12, lr, #6\n\t" + "eor r4, r4, r5\n\t" + "eor r12, r12, lr, ror 11\n\t" + "and r4, r4, lr\n\t" + "eor r12, r12, lr, ror 25\n\t" + "eor r4, r4, r5\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [sp, #60]\n\t" + "ldr r4, [r3, #60]\n\t" + "add r7, r7, lr\n\t" + "add r7, r7, r4\n\t" + "ldr lr, [%[sha256], #4]\n\t" + "ldr r4, [%[sha256], #8]\n\t" + "ldr r5, [%[sha256], #12]\n\t" + "ldr r6, [%[sha256], #16]\n\t" + "ror r12, lr, #2\n\t" + "eor r9, lr, r4\n\t" + "eor r12, r12, lr, ror 13\n\t" + "and r8, r8, r9\n\t" + "eor r12, r12, lr, ror 22\n\t" + "eor r8, r8, r4\n\t" + "add r6, r6, r7\n\t" + "add r7, r7, r12\n\t" + "add r7, r7, r8\n\t" + "str r6, [%[sha256], #16]\n\t" + "str r7, [%[sha256]]\n\t" + /* Add in digest from start */ + "ldrd r12, lr, [%[sha256]]\n\t" + "ldrd r4, r5, [%[sha256], #8]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "ldrd r8, r9, [sp, #72]\n\t" + "add r12, r12, r6\n\t" + "add lr, lr, r7\n\t" + "add r4, r4, r8\n\t" + "add r5, r5, r9\n\t" + "strd r12, lr, [%[sha256]]\n\t" + "strd r4, r5, [%[sha256], #8]\n\t" + "strd r12, lr, [sp, #64]\n\t" + "strd r4, r5, [sp, #72]\n\t" + "ldrd r12, lr, [%[sha256], #16]\n\t" + "ldrd r4, r5, [%[sha256], #24]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "ldrd r8, r9, [sp, #88]\n\t" + "add r12, r12, r6\n\t" + "add lr, lr, r7\n\t" + "add r4, r4, r8\n\t" + "add r5, r5, r9\n\t" + "strd r12, lr, [%[sha256], #16]\n\t" + "strd r4, r5, [%[sha256], #24]\n\t" + "strd r12, lr, [sp, #80]\n\t" + "strd r4, r5, [sp, #88]\n\t" + "subs %[len], %[len], #0x40\n\t" + "sub r3, r3, #0xc0\n\t" + "add %[data], %[data], #0x40\n\t" + "bne L_SHA256_transform_len_begin_%=\n\t" + "add sp, sp, #0xc0\n\t" + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#include + +#ifndef WOLFSSL_ARMASM_NO_NEON +static const uint32_t L_SHA256_transform_neon_len_k[] = { + 0x428a2f98, + 0x71374491, + 0xb5c0fbcf, + 0xe9b5dba5, + 0x3956c25b, + 0x59f111f1, + 0x923f82a4, + 0xab1c5ed5, + 0xd807aa98, + 0x12835b01, + 0x243185be, + 0x550c7dc3, + 0x72be5d74, + 0x80deb1fe, + 0x9bdc06a7, + 0xc19bf174, + 0xe49b69c1, + 0xefbe4786, + 
0xfc19dc6, + 0x240ca1cc, + 0x2de92c6f, + 0x4a7484aa, + 0x5cb0a9dc, + 0x76f988da, + 0x983e5152, + 0xa831c66d, + 0xb00327c8, + 0xbf597fc7, + 0xc6e00bf3, + 0xd5a79147, + 0x6ca6351, + 0x14292967, + 0x27b70a85, + 0x2e1b2138, + 0x4d2c6dfc, + 0x53380d13, + 0x650a7354, + 0x766a0abb, + 0x81c2c92e, + 0x92722c85, + 0xa2bfe8a1, + 0xa81a664b, + 0xc24b8b70, + 0xc76c51a3, + 0xd192e819, + 0xd6990624, + 0xf40e3585, + 0x106aa070, + 0x19a4c116, + 0x1e376c08, + 0x2748774c, + 0x34b0bcb5, + 0x391c0cb3, + 0x4ed8aa4a, + 0x5b9cca4f, + 0x682e6ff3, + 0x748f82ee, + 0x78a5636f, + 0x84c87814, + 0x8cc70208, + 0x90befffa, + 0xa4506ceb, + 0xbef9a3f7, + 0xc67178f2, +}; + +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "sub sp, sp, #24\n\t" + "strd %[sha256], %[data], [sp]\n\t" + "str %[len], [sp, #8]\n\t" + "mov r12, %[L_SHA256_transform_neon_len_k]\n\t" + /* Load digest into registers */ + "ldrd %[len], r3, [%[sha256]]\n\t" + "ldrd r4, r5, [%[sha256], #8]\n\t" + "ldrd r6, r7, [%[sha256], #16]\n\t" + "ldrd r8, r9, [%[sha256], #24]\n\t" + /* Start of loop processing a block */ + "\n" + "L_SHA256_transform_neon_len_begin_%=: \n\t" + /* Load W */ + "vldm.32 %[data]!, {d0-d7}\n\t" + "vrev32.8 q0, q0\n\t" + "vrev32.8 q1, q1\n\t" + "vrev32.8 q2, q2\n\t" + "vrev32.8 q3, q3\n\t" + "str %[data], [sp, #4]\n\t" + "mov lr, #3\n\t" + /* Start of 16 rounds */ + "\n" + "L_SHA256_transform_neon_len_start_%=: \n\t" + /* Round 0 */ + "vmov r10, d0[0]\n\t" + "ror %[sha256], r6, #6\n\t" + "eor %[data], r7, r8\n\t" + "eor %[sha256], %[sha256], r6, ror 11\n\t" + "and %[data], %[data], r6\n\t" + "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[data], %[data], r8\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + "ldr %[sha256], [r12]\n\t" + "add r9, r9, r10\n\t" + "add r9, r9, %[sha256]\n\t" + "add r5, r5, r9\n\t" + "ror %[sha256], %[len], #2\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor r10, r3, r4\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[data], %[data], r3\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + /* Round 1 */ + "vmov r10, d0[1]\n\t" + /* Calc new W[0]-W[1] */ + "vext.8 d10, d0, d1, #4\n\t" + "ror %[sha256], r5, #6\n\t" + "vshl.u32 d8, d7, #15\n\t" + "eor %[data], r6, r7\n\t" + "vsri.u32 d8, d7, #17\n\t" + "eor %[sha256], %[sha256], r5, ror 11\n\t" + "vshl.u32 d9, d7, #13\n\t" + "and %[data], %[data], r5\n\t" + "vsri.u32 d9, d7, #19\n\t" + "eor %[sha256], %[sha256], r5, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r7\n\t" + "vshr.u32 d8, d7, #10\n\t" + "add r8, r8, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r8, r8, %[data]\n\t" + "vadd.i32 d0, d9\n\t" + "ldr %[sha256], [r12, #4]\n\t" + "vext.8 d11, d4, d5, #4\n\t" + "add r8, r8, r10\n\t" + "vadd.i32 d0, d11\n\t" + "add r8, r8, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r4, r4, r8\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r9, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r9, %[len]\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r9, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, %[len], r3\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r9, ror 22\n\t" + "vadd.i32 d0, d9\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + /* Round 2 */ + "vmov r10, 
d1[0]\n\t" + "ror %[sha256], r4, #6\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r4, ror 11\n\t" + "and %[data], %[data], r4\n\t" + "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[data], %[data], r6\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + "ldr %[sha256], [r12, #8]\n\t" + "add r7, r7, r10\n\t" + "add r7, r7, %[sha256]\n\t" + "add r3, r3, r7\n\t" + "ror %[sha256], r8, #2\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor r10, r9, %[len]\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[data], %[data], r9\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + /* Round 3 */ + "vmov r10, d1[1]\n\t" + /* Calc new W[2]-W[3] */ + "vext.8 d10, d1, d2, #4\n\t" + "ror %[sha256], r3, #6\n\t" + "vshl.u32 d8, d0, #15\n\t" + "eor %[data], r4, r5\n\t" + "vsri.u32 d8, d0, #17\n\t" + "eor %[sha256], %[sha256], r3, ror 11\n\t" + "vshl.u32 d9, d0, #13\n\t" + "and %[data], %[data], r3\n\t" + "vsri.u32 d9, d0, #19\n\t" + "eor %[sha256], %[sha256], r3, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r5\n\t" + "vshr.u32 d8, d0, #10\n\t" + "add r6, r6, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r6, r6, %[data]\n\t" + "vadd.i32 d1, d9\n\t" + "ldr %[sha256], [r12, #12]\n\t" + "vext.8 d11, d5, d6, #4\n\t" + "add r6, r6, r10\n\t" + "vadd.i32 d1, d11\n\t" + "add r6, r6, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add %[len], %[len], r6\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r7, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r7, r8\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r7, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r8, r9\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r7, ror 22\n\t" + "vadd.i32 d1, d9\n\t" + "eor %[data], %[data], r8\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + /* Round 4 */ + "vmov r10, d2[0]\n\t" + "ror %[sha256], %[len], #6\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "and %[data], %[data], %[len]\n\t" + "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[data], %[data], r4\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + "ldr %[sha256], [r12, #16]\n\t" + "add r5, r5, r10\n\t" + "add r5, r5, %[sha256]\n\t" + "add r9, r9, r5\n\t" + "ror %[sha256], r6, #2\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor r10, r7, r8\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[data], %[data], r7\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + /* Round 5 */ + "vmov r10, d2[1]\n\t" + /* Calc new W[4]-W[5] */ + "vext.8 d10, d2, d3, #4\n\t" + "ror %[sha256], r9, #6\n\t" + "vshl.u32 d8, d1, #15\n\t" + "eor %[data], %[len], r3\n\t" + "vsri.u32 d8, d1, #17\n\t" + "eor %[sha256], %[sha256], r9, ror 11\n\t" + "vshl.u32 d9, d1, #13\n\t" + "and %[data], %[data], r9\n\t" + "vsri.u32 d9, d1, #19\n\t" + "eor %[sha256], %[sha256], r9, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r3\n\t" + "vshr.u32 d8, d1, #10\n\t" + "add r4, r4, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r4, r4, %[data]\n\t" + "vadd.i32 d2, d9\n\t" + "ldr %[sha256], [r12, #20]\n\t" + "vext.8 d11, d6, d7, #4\n\t" + "add r4, r4, r10\n\t" + "vadd.i32 d2, d11\n\t" + "add r4, r4, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r8, r8, r4\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r5, #2\n\t" + "vshl.u32 d9, d10, 
#14\n\t" + "eor %[data], r5, r6\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r5, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r6, r7\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r5, ror 22\n\t" + "vadd.i32 d2, d9\n\t" + "eor %[data], %[data], r6\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + /* Round 6 */ + "vmov r10, d3[0]\n\t" + "ror %[sha256], r8, #6\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r8, ror 11\n\t" + "and %[data], %[data], r8\n\t" + "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + "ldr %[sha256], [r12, #24]\n\t" + "add r3, r3, r10\n\t" + "add r3, r3, %[sha256]\n\t" + "add r7, r7, r3\n\t" + "ror %[sha256], r4, #2\n\t" + "eor %[data], r4, r5\n\t" + "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor r10, r5, r6\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[data], %[data], r5\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + /* Round 7 */ + "vmov r10, d3[1]\n\t" + /* Calc new W[6]-W[7] */ + "vext.8 d10, d3, d4, #4\n\t" + "ror %[sha256], r7, #6\n\t" + "vshl.u32 d8, d2, #15\n\t" + "eor %[data], r8, r9\n\t" + "vsri.u32 d8, d2, #17\n\t" + "eor %[sha256], %[sha256], r7, ror 11\n\t" + "vshl.u32 d9, d2, #13\n\t" + "and %[data], %[data], r7\n\t" + "vsri.u32 d9, d2, #19\n\t" + "eor %[sha256], %[sha256], r7, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r9\n\t" + "vshr.u32 d8, d2, #10\n\t" + "add %[len], %[len], %[sha256]\n\t" + "veor d9, d8\n\t" + "add %[len], %[len], %[data]\n\t" + "vadd.i32 d3, d9\n\t" + "ldr %[sha256], [r12, #28]\n\t" + "vext.8 d11, d7, d0, #4\n\t" + "add %[len], %[len], r10\n\t" + "vadd.i32 d3, d11\n\t" + "add %[len], %[len], %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r6, r6, %[len]\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r3, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r3, r4\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r3, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r4, r5\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r3, ror 22\n\t" + "vadd.i32 d3, d9\n\t" + "eor %[data], %[data], r4\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + /* Round 8 */ + "vmov r10, d4[0]\n\t" + "ror %[sha256], r6, #6\n\t" + "eor %[data], r7, r8\n\t" + "eor %[sha256], %[sha256], r6, ror 11\n\t" + "and %[data], %[data], r6\n\t" + "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[data], %[data], r8\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + "ldr %[sha256], [r12, #32]\n\t" + "add r9, r9, r10\n\t" + "add r9, r9, %[sha256]\n\t" + "add r5, r5, r9\n\t" + "ror %[sha256], %[len], #2\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor r10, r3, r4\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[data], %[data], r3\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + /* Round 9 */ + "vmov r10, d4[1]\n\t" + /* Calc new W[8]-W[9] */ + "vext.8 d10, d4, d5, #4\n\t" + "ror %[sha256], r5, #6\n\t" + "vshl.u32 d8, d3, #15\n\t" + "eor %[data], r6, r7\n\t" + "vsri.u32 d8, d3, #17\n\t" + "eor %[sha256], %[sha256], r5, ror 11\n\t" + "vshl.u32 d9, d3, #13\n\t" + "and %[data], %[data], r5\n\t" + "vsri.u32 d9, d3, #19\n\t" + "eor %[sha256], %[sha256], r5, ror 
25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r7\n\t" + "vshr.u32 d8, d3, #10\n\t" + "add r8, r8, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r8, r8, %[data]\n\t" + "vadd.i32 d4, d9\n\t" + "ldr %[sha256], [r12, #36]\n\t" + "vext.8 d11, d0, d1, #4\n\t" + "add r8, r8, r10\n\t" + "vadd.i32 d4, d11\n\t" + "add r8, r8, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r4, r4, r8\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r9, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r9, %[len]\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r9, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, %[len], r3\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r9, ror 22\n\t" + "vadd.i32 d4, d9\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + /* Round 10 */ + "vmov r10, d5[0]\n\t" + "ror %[sha256], r4, #6\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r4, ror 11\n\t" + "and %[data], %[data], r4\n\t" + "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[data], %[data], r6\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + "ldr %[sha256], [r12, #40]\n\t" + "add r7, r7, r10\n\t" + "add r7, r7, %[sha256]\n\t" + "add r3, r3, r7\n\t" + "ror %[sha256], r8, #2\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor r10, r9, %[len]\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[data], %[data], r9\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + /* Round 11 */ + "vmov r10, d5[1]\n\t" + /* Calc new W[10]-W[11] */ + "vext.8 d10, d5, d6, #4\n\t" + "ror %[sha256], r3, #6\n\t" + "vshl.u32 d8, d4, #15\n\t" + "eor %[data], r4, r5\n\t" + "vsri.u32 d8, d4, #17\n\t" + "eor %[sha256], %[sha256], r3, ror 11\n\t" + "vshl.u32 d9, d4, #13\n\t" + "and %[data], %[data], r3\n\t" + "vsri.u32 d9, d4, #19\n\t" + "eor %[sha256], %[sha256], r3, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r5\n\t" + "vshr.u32 d8, d4, #10\n\t" + "add r6, r6, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r6, r6, %[data]\n\t" + "vadd.i32 d5, d9\n\t" + "ldr %[sha256], [r12, #44]\n\t" + "vext.8 d11, d1, d2, #4\n\t" + "add r6, r6, r10\n\t" + "vadd.i32 d5, d11\n\t" + "add r6, r6, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add %[len], %[len], r6\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r7, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r7, r8\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r7, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r8, r9\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r7, ror 22\n\t" + "vadd.i32 d5, d9\n\t" + "eor %[data], %[data], r8\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + /* Round 12 */ + "vmov r10, d6[0]\n\t" + "ror %[sha256], %[len], #6\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "and %[data], %[data], %[len]\n\t" + "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[data], %[data], r4\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + "ldr %[sha256], [r12, #48]\n\t" + "add r5, r5, r10\n\t" + "add r5, r5, %[sha256]\n\t" + "add r9, r9, r5\n\t" + "ror %[sha256], r6, #2\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor r10, r7, r8\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[data], %[data], r7\n\t" + "add r5, 
r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + /* Round 13 */ + "vmov r10, d6[1]\n\t" + /* Calc new W[12]-W[13] */ + "vext.8 d10, d6, d7, #4\n\t" + "ror %[sha256], r9, #6\n\t" + "vshl.u32 d8, d5, #15\n\t" + "eor %[data], %[len], r3\n\t" + "vsri.u32 d8, d5, #17\n\t" + "eor %[sha256], %[sha256], r9, ror 11\n\t" + "vshl.u32 d9, d5, #13\n\t" + "and %[data], %[data], r9\n\t" + "vsri.u32 d9, d5, #19\n\t" + "eor %[sha256], %[sha256], r9, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r3\n\t" + "vshr.u32 d8, d5, #10\n\t" + "add r4, r4, %[sha256]\n\t" + "veor d9, d8\n\t" + "add r4, r4, %[data]\n\t" + "vadd.i32 d6, d9\n\t" + "ldr %[sha256], [r12, #52]\n\t" + "vext.8 d11, d2, d3, #4\n\t" + "add r4, r4, r10\n\t" + "vadd.i32 d6, d11\n\t" + "add r4, r4, %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r8, r8, r4\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r5, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r5, r6\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r5, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r6, r7\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r5, ror 22\n\t" + "vadd.i32 d6, d9\n\t" + "eor %[data], %[data], r6\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + /* Round 14 */ + "vmov r10, d7[0]\n\t" + "ror %[sha256], r8, #6\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r8, ror 11\n\t" + "and %[data], %[data], r8\n\t" + "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + "ldr %[sha256], [r12, #56]\n\t" + "add r3, r3, r10\n\t" + "add r3, r3, %[sha256]\n\t" + "add r7, r7, r3\n\t" + "ror %[sha256], r4, #2\n\t" + "eor %[data], r4, r5\n\t" + "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor r10, r5, r6\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[data], %[data], r5\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + /* Round 15 */ + "vmov r10, d7[1]\n\t" + /* Calc new W[14]-W[15] */ + "vext.8 d10, d7, d0, #4\n\t" + "ror %[sha256], r7, #6\n\t" + "vshl.u32 d8, d6, #15\n\t" + "eor %[data], r8, r9\n\t" + "vsri.u32 d8, d6, #17\n\t" + "eor %[sha256], %[sha256], r7, ror 11\n\t" + "vshl.u32 d9, d6, #13\n\t" + "and %[data], %[data], r7\n\t" + "vsri.u32 d9, d6, #19\n\t" + "eor %[sha256], %[sha256], r7, ror 25\n\t" + "veor d9, d8\n\t" + "eor %[data], %[data], r9\n\t" + "vshr.u32 d8, d6, #10\n\t" + "add %[len], %[len], %[sha256]\n\t" + "veor d9, d8\n\t" + "add %[len], %[len], %[data]\n\t" + "vadd.i32 d7, d9\n\t" + "ldr %[sha256], [r12, #60]\n\t" + "vext.8 d11, d3, d4, #4\n\t" + "add %[len], %[len], r10\n\t" + "vadd.i32 d7, d11\n\t" + "add %[len], %[len], %[sha256]\n\t" + "vshl.u32 d8, d10, #25\n\t" + "add r6, r6, %[len]\n\t" + "vsri.u32 d8, d10, #7\n\t" + "ror %[sha256], r3, #2\n\t" + "vshl.u32 d9, d10, #14\n\t" + "eor %[data], r3, r4\n\t" + "vsri.u32 d9, d10, #18\n\t" + "eor %[sha256], %[sha256], r3, ror 13\n\t" + "veor d9, d8\n\t" + "eor r10, r4, r5\n\t" + "vshr.u32 d10, #3\n\t" + "and %[data], %[data], r10\n\t" + "veor d9, d10\n\t" + "eor %[sha256], %[sha256], r3, ror 22\n\t" + "vadd.i32 d7, d9\n\t" + "eor %[data], %[data], r4\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + "add r12, r12, #0x40\n\t" + "subs lr, lr, #1\n\t" + "bne L_SHA256_transform_neon_len_start_%=\n\t" + /* Round 0 */ + "vmov r10, d0[0]\n\t" + "ror %[sha256], r6, #6\n\t" + "eor %[data], r7, r8\n\t" + "eor 
%[sha256], %[sha256], r6, ror 11\n\t" + "and %[data], %[data], r6\n\t" + "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[data], %[data], r8\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + "ldr %[sha256], [r12]\n\t" + "add r9, r9, r10\n\t" + "add r9, r9, %[sha256]\n\t" + "add r5, r5, r9\n\t" + "ror %[sha256], %[len], #2\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor r10, r3, r4\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[data], %[data], r3\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + /* Round 1 */ + "vmov r10, d0[1]\n\t" + "ror %[sha256], r5, #6\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r5, ror 11\n\t" + "and %[data], %[data], r5\n\t" + "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[data], %[data], r7\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + "ldr %[sha256], [r12, #4]\n\t" + "add r8, r8, r10\n\t" + "add r8, r8, %[sha256]\n\t" + "add r4, r4, r8\n\t" + "ror %[sha256], r9, #2\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor r10, %[len], r3\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r9, ror 22\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + /* Round 2 */ + "vmov r10, d1[0]\n\t" + "ror %[sha256], r4, #6\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r4, ror 11\n\t" + "and %[data], %[data], r4\n\t" + "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[data], %[data], r6\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + "ldr %[sha256], [r12, #8]\n\t" + "add r7, r7, r10\n\t" + "add r7, r7, %[sha256]\n\t" + "add r3, r3, r7\n\t" + "ror %[sha256], r8, #2\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor r10, r9, %[len]\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[data], %[data], r9\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + /* Round 3 */ + "vmov r10, d1[1]\n\t" + "ror %[sha256], r3, #6\n\t" + "eor %[data], r4, r5\n\t" + "eor %[sha256], %[sha256], r3, ror 11\n\t" + "and %[data], %[data], r3\n\t" + "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[data], %[data], r5\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + "ldr %[sha256], [r12, #12]\n\t" + "add r6, r6, r10\n\t" + "add r6, r6, %[sha256]\n\t" + "add %[len], %[len], r6\n\t" + "ror %[sha256], r7, #2\n\t" + "eor %[data], r7, r8\n\t" + "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor r10, r8, r9\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[data], %[data], r8\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + /* Round 4 */ + "vmov r10, d2[0]\n\t" + "ror %[sha256], %[len], #6\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "and %[data], %[data], %[len]\n\t" + "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[data], %[data], r4\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + "ldr %[sha256], [r12, #16]\n\t" + "add r5, r5, r10\n\t" + "add r5, r5, %[sha256]\n\t" + "add r9, r9, r5\n\t" + "ror %[sha256], r6, #2\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor r10, r7, r8\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[data], %[data], r7\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + /* Round 
5 */ + "vmov r10, d2[1]\n\t" + "ror %[sha256], r9, #6\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], r9, ror 11\n\t" + "and %[data], %[data], r9\n\t" + "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[data], %[data], r3\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + "ldr %[sha256], [r12, #20]\n\t" + "add r4, r4, r10\n\t" + "add r4, r4, %[sha256]\n\t" + "add r8, r8, r4\n\t" + "ror %[sha256], r5, #2\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor r10, r6, r7\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[data], %[data], r6\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + /* Round 6 */ + "vmov r10, d3[0]\n\t" + "ror %[sha256], r8, #6\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r8, ror 11\n\t" + "and %[data], %[data], r8\n\t" + "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + "ldr %[sha256], [r12, #24]\n\t" + "add r3, r3, r10\n\t" + "add r3, r3, %[sha256]\n\t" + "add r7, r7, r3\n\t" + "ror %[sha256], r4, #2\n\t" + "eor %[data], r4, r5\n\t" + "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor r10, r5, r6\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[data], %[data], r5\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + /* Round 7 */ + "vmov r10, d3[1]\n\t" + "ror %[sha256], r7, #6\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r7, ror 11\n\t" + "and %[data], %[data], r7\n\t" + "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[data], %[data], r9\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + "ldr %[sha256], [r12, #28]\n\t" + "add %[len], %[len], r10\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add r6, r6, %[len]\n\t" + "ror %[sha256], r3, #2\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor r10, r4, r5\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[data], %[data], r4\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + /* Round 8 */ + "vmov r10, d4[0]\n\t" + "ror %[sha256], r6, #6\n\t" + "eor %[data], r7, r8\n\t" + "eor %[sha256], %[sha256], r6, ror 11\n\t" + "and %[data], %[data], r6\n\t" + "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[data], %[data], r8\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + "ldr %[sha256], [r12, #32]\n\t" + "add r9, r9, r10\n\t" + "add r9, r9, %[sha256]\n\t" + "add r5, r5, r9\n\t" + "ror %[sha256], %[len], #2\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor r10, r3, r4\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[data], %[data], r3\n\t" + "add r9, r9, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + /* Round 9 */ + "vmov r10, d4[1]\n\t" + "ror %[sha256], r5, #6\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r5, ror 11\n\t" + "and %[data], %[data], r5\n\t" + "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[data], %[data], r7\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + "ldr %[sha256], [r12, #36]\n\t" + "add r8, r8, r10\n\t" + "add r8, r8, %[sha256]\n\t" + "add r4, r4, r8\n\t" + "ror %[sha256], r9, #2\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor r10, %[len], r3\n\t" + "and %[data], %[data], r10\n\t" + "eor 
%[sha256], %[sha256], r9, ror 22\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r8, r8, %[sha256]\n\t" + "add r8, r8, %[data]\n\t" + /* Round 10 */ + "vmov r10, d5[0]\n\t" + "ror %[sha256], r4, #6\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r4, ror 11\n\t" + "and %[data], %[data], r4\n\t" + "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[data], %[data], r6\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + "ldr %[sha256], [r12, #40]\n\t" + "add r7, r7, r10\n\t" + "add r7, r7, %[sha256]\n\t" + "add r3, r3, r7\n\t" + "ror %[sha256], r8, #2\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor r10, r9, %[len]\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[data], %[data], r9\n\t" + "add r7, r7, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + /* Round 11 */ + "vmov r10, d5[1]\n\t" + "ror %[sha256], r3, #6\n\t" + "eor %[data], r4, r5\n\t" + "eor %[sha256], %[sha256], r3, ror 11\n\t" + "and %[data], %[data], r3\n\t" + "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[data], %[data], r5\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + "ldr %[sha256], [r12, #44]\n\t" + "add r6, r6, r10\n\t" + "add r6, r6, %[sha256]\n\t" + "add %[len], %[len], r6\n\t" + "ror %[sha256], r7, #2\n\t" + "eor %[data], r7, r8\n\t" + "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor r10, r8, r9\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[data], %[data], r8\n\t" + "add r6, r6, %[sha256]\n\t" + "add r6, r6, %[data]\n\t" + /* Round 12 */ + "vmov r10, d6[0]\n\t" + "ror %[sha256], %[len], #6\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "and %[data], %[data], %[len]\n\t" + "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[data], %[data], r4\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + "ldr %[sha256], [r12, #48]\n\t" + "add r5, r5, r10\n\t" + "add r5, r5, %[sha256]\n\t" + "add r9, r9, r5\n\t" + "ror %[sha256], r6, #2\n\t" + "eor %[data], r6, r7\n\t" + "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor r10, r7, r8\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[data], %[data], r7\n\t" + "add r5, r5, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + /* Round 13 */ + "vmov r10, d6[1]\n\t" + "ror %[sha256], r9, #6\n\t" + "eor %[data], %[len], r3\n\t" + "eor %[sha256], %[sha256], r9, ror 11\n\t" + "and %[data], %[data], r9\n\t" + "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[data], %[data], r3\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + "ldr %[sha256], [r12, #52]\n\t" + "add r4, r4, r10\n\t" + "add r4, r4, %[sha256]\n\t" + "add r8, r8, r4\n\t" + "ror %[sha256], r5, #2\n\t" + "eor %[data], r5, r6\n\t" + "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor r10, r6, r7\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[data], %[data], r6\n\t" + "add r4, r4, %[sha256]\n\t" + "add r4, r4, %[data]\n\t" + /* Round 14 */ + "vmov r10, d7[0]\n\t" + "ror %[sha256], r8, #6\n\t" + "eor %[data], r9, %[len]\n\t" + "eor %[sha256], %[sha256], r8, ror 11\n\t" + "and %[data], %[data], r8\n\t" + "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[data], %[data], %[len]\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + "ldr %[sha256], [r12, #56]\n\t" + "add r3, r3, r10\n\t" + "add r3, r3, %[sha256]\n\t" + "add r7, r7, r3\n\t" + "ror %[sha256], r4, #2\n\t" + "eor %[data], r4, r5\n\t" + "eor 
%[sha256], %[sha256], r4, ror 13\n\t" + "eor r10, r5, r6\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[data], %[data], r5\n\t" + "add r3, r3, %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + /* Round 15 */ + "vmov r10, d7[1]\n\t" + "ror %[sha256], r7, #6\n\t" + "eor %[data], r8, r9\n\t" + "eor %[sha256], %[sha256], r7, ror 11\n\t" + "and %[data], %[data], r7\n\t" + "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[data], %[data], r9\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + "ldr %[sha256], [r12, #60]\n\t" + "add %[len], %[len], r10\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add r6, r6, %[len]\n\t" + "ror %[sha256], r3, #2\n\t" + "eor %[data], r3, r4\n\t" + "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor r10, r4, r5\n\t" + "and %[data], %[data], r10\n\t" + "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[data], %[data], r4\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add %[len], %[len], %[data]\n\t" + "ldr r10, [sp]\n\t" + /* Add in digest from start */ + "ldrd %[sha256], %[data], [r10]\n\t" + "add %[len], %[len], %[sha256]\n\t" + "add r3, r3, %[data]\n\t" + "strd %[len], r3, [r10]\n\t" + "ldrd %[sha256], %[data], [r10, #8]\n\t" + "add r4, r4, %[sha256]\n\t" + "add r5, r5, %[data]\n\t" + "strd r4, r5, [r10, #8]\n\t" + "ldrd %[sha256], %[data], [r10, #16]\n\t" + "add r6, r6, %[sha256]\n\t" + "add r7, r7, %[data]\n\t" + "strd r6, r7, [r10, #16]\n\t" + "ldrd %[sha256], %[data], [r10, #24]\n\t" + "add r8, r8, %[sha256]\n\t" + "add r9, r9, %[data]\n\t" + "strd r8, r9, [r10, #24]\n\t" + "ldr r10, [sp, #8]\n\t" + "ldr %[data], [sp, #4]\n\t" + "subs r10, r10, #0x40\n\t" + "sub r12, r12, #0xc0\n\t" + "str r10, [sp, #8]\n\t" + "bne L_SHA256_transform_neon_len_begin_%=\n\t" + "add sp, sp, #24\n\t" + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k), [L_SHA256_transform_neon_len_k] "r" (L_SHA256_transform_neon_len_k) + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* !__aarch64__ */ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 69067bea1..83e3aa6a5 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -225,7 +225,7 @@ Transform_Sha512_Len: strd r6, r7, [sp, #176] strd r8, r9, [sp, #184] # Start of loop processing a block -L_sha512_len_neon_begin: +L_SHA512_transform_len_begin: # Load, Reverse and Store W ldr r12, [r1] ldr lr, [r1, #4] @@ -319,7 +319,7 @@ L_sha512_len_neon_begin: eor r9, r9, lr mov r10, #4 # Start of 16 rounds -L_sha512_len_neon_start: +L_SHA512_transform_len_start: # Round 0 ldr r12, [r0, #32] ldr lr, [r0, #36] @@ -2546,7 +2546,7 @@ L_sha512_len_neon_start: str lr, [sp, #124] add r3, r3, #0x80 subs r10, r10, #1 - bne L_sha512_len_neon_start + bne L_SHA512_transform_len_start # Round 0 ldr r12, [r0, #32] ldr lr, [r0, #36] @@ -4035,7 +4035,7 @@ L_sha512_len_neon_start: subs r2, r2, #0x80 sub r3, r3, #0x200 add r1, r1, #0x80 - bne L_sha512_len_neon_begin + bne L_SHA512_transform_len_begin eor r0, r0, r0 add sp, sp, #0xc0 pop {r4, r5, r6, r7, r8, r9, r10, pc} @@ -4216,7 +4216,7 @@ Transform_Sha512_Len: # Load digest into working vars vldm.64 r0, {d0-d7} # Start of loop processing a block 
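For reference, each "Round N" block in the NEON SHA-256 transform above computes the standard FIPS 180-4 compression step: the message word is pulled out of a NEON lane with vmov, the round constant is loaded from the K table through r12, and the Sigma1/Ch and Sigma0/Maj halves use the ror amounts 6/11/25 and 2/13/22 visible in the eor sequences. A minimal C model of one round follows; the helper names (sha256_round, rotr32) are illustrative only and not part of the patch, and the explicit rotation of the working variables stands in for the register renaming the assembly does implicitly from round to round.

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round, as computed by each "Round N" block:
 *   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
 *   T2 = Sigma0(a) + Maj(a,b,c)
 * then d += T1 and the new a is T1 + T2. */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    /* Sigma1(e): ror amounts 6, 11, 25, as in the assembly. */
    uint32_t sig1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    /* Ch(e,f,g) in the eor/and/eor form ((f ^ g) & e) ^ g. */
    uint32_t ch   = ((f ^ g) & e) ^ g;
    uint32_t t1   = h + sig1 + ch + k + w;

    /* Sigma0(a): ror amounts 2, 13, 22. */
    uint32_t sig0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    /* Maj(a,b,c) in the eor/and/eor form ((a ^ b) & (b ^ c)) ^ b. */
    uint32_t maj  = ((a ^ b) & (b ^ c)) ^ b;
    uint32_t t2   = sig0 + maj;

    /* Rotate the working variables for the next round. */
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The Ch and Maj forms above are bitwise-equivalent rewrites of the textbook definitions and match the eor/and/eor sequences used in the rounds to save an instruction each.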
-L_sha512_len_neon_begin: +L_SHA512_transform_neon_len_begin: # Load W vldm.64 r1!, {d16-d31} vrev64.8 q8, q8 @@ -4230,7 +4230,7 @@ L_sha512_len_neon_begin: adr r3, L_SHA512_transform_neon_len_k mov r12, #4 # Start of 16 rounds -L_sha512_len_neon_start: +L_SHA512_transform_neon_len_start: # Round 0 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d4, #50 @@ -4856,7 +4856,7 @@ L_sha512_len_neon_start: veor q5, q6 vadd.i64 q15, q5 subs r12, r12, #1 - bne L_sha512_len_neon_start + bne L_SHA512_transform_neon_len_start # Round 0 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d4, #50 @@ -5329,7 +5329,7 @@ L_sha512_len_neon_start: vadd.i64 q3, q3, q7 vstm.64 r0, {d0-d7} subs r2, r2, #0x80 - bne L_sha512_len_neon_begin + bne L_SHA512_transform_neon_len_begin vpop {d8-d15} bx lr .size Transform_Sha512_Len,.-Transform_Sha512_Len diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index b67dae8bd..103ad4fa1 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -120,7 +120,7 @@ static const uint64_t L_SHA512_transform_len_k[] = { 0x6c44198c4a475817UL, }; -void Transform_Sha512_Len(); +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) { __asm__ __volatile__ ( @@ -145,7 +145,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r8, r9, [sp, #184]\n\t" /* Start of loop processing a block */ "\n" - "L_sha512_len_neon_begin_%=: \n\t" + "L_SHA512_transform_len_begin_%=: \n\t" /* Load, Reverse and Store W */ "ldrd r12, lr, [%[data]]\n\t" "ldrd r4, r5, [%[data], #8]\n\t" @@ -235,7 +235,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "mov r10, #4\n\t" /* Start of 16 rounds */ "\n" - "L_sha512_len_neon_start_%=: \n\t" + "L_SHA512_transform_len_start_%=: \n\t" /* Round 0 */ "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" @@ -2222,7 +2222,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r12, lr, [sp, #120]\n\t" "add r3, r3, #0x80\n\t" "subs r10, r10, #1\n\t" - "bne L_sha512_len_neon_start_%=\n\t" + "bne L_SHA512_transform_len_start_%=\n\t" /* Round 0 */ "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" @@ -3555,7 +3555,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "subs %[len], %[len], #0x80\n\t" "sub r3, r3, #0x200\n\t" "add %[data], %[data], #0x80\n\t" - "bne L_sha512_len_neon_begin_%=\n\t" + "bne L_SHA512_transform_len_begin_%=\n\t" "eor r0, r0, r0\n\t" "add sp, sp, #0xc0\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) @@ -3659,7 +3659,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vldm.64 %[sha512], {d0-d7}\n\t" /* Start of loop processing a block */ "\n" - "L_sha512_len_neon_begin_%=: \n\t" + "L_SHA512_transform_neon_len_begin_%=: \n\t" /* Load W */ "vldm.64 %[data]!, {d16-d31}\n\t" "vrev64.8 q8, q8\n\t" @@ -3674,7 +3674,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "mov r12, #4\n\t" /* Start of 16 rounds */ "\n" - "L_sha512_len_neon_start_%=: \n\t" + "L_SHA512_transform_neon_len_start_%=: \n\t" /* Round 0 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d4, #50\n\t" @@ -4300,7 +4300,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "veor q5, q6\n\t" "vadd.i64 q15, q5\n\t" "subs r12, r12, #1\n\t" - "bne L_sha512_len_neon_start_%=\n\t" + "bne 
L_SHA512_transform_neon_len_start_%=\n\t" /* Round 0 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d4, #50\n\t" @@ -4773,7 +4773,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 q3, q3, q7\n\t" "vstm.64 %[sha512], {d0-d7}\n\t" "subs %[len], %[len], #0x80\n\t" - "bne L_sha512_len_neon_begin_%=\n\t" + "bne L_SHA512_transform_neon_len_begin_%=\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) : [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k) : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 48158f1a6..0056a3ccb 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -32,7 +32,8 @@ #include -#if !defined(NO_AES) && defined(WOLFSSL_ARMASM) +#if !defined(NO_AES) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_CRYPTO) #ifdef HAVE_FIPS #undef HAVE_FIPS diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index 139d3a734..4109dd19f 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -45,6 +45,7 @@ #endif +#ifndef WOLFSSL_ARMASM_NO_CRYPTO static const ALIGN32 word32 K[64] = { 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, @@ -60,6 +61,7 @@ static const ALIGN32 word32 K[64] = { 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L }; +#endif static int InitSha256(wc_Sha256* sha256) @@ -94,6 +96,8 @@ static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) } +#ifndef WOLFSSL_ARMASM_NO_CRYPTO + #ifdef __aarch64__ /* First block is in sha256->buffer and rest in data. */ @@ -1306,6 +1310,109 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) #endif /* __aarch64__ */ +#else + +extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, + word32 len); + +/* ARMv8 hardware acceleration Aarch32 */ +static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) +{ + int ret = 0; + /* do block size increments */ + byte* local = (byte*)sha256->buffer; + word32 blocksLen; + + /* check that internal buffLen is valid */ + if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE) + return BUFFER_E; + + AddLength(sha256, len); + + if (sha256->buffLen > 0) { + word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); + if (add > 0) { + XMEMCPY(&local[sha256->buffLen], data, add); + + sha256->buffLen += add; + data += add; + len -= add; + } + + if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) { + Transform_Sha256_Len(sha256, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE); + sha256->buffLen = 0; + } + } + + blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); + if (blocksLen > 0) { + /* Byte reversal performed in function if required. 
*/ + Transform_Sha256_Len(sha256, data, blocksLen); + data += blocksLen; + len -= blocksLen; + } + + if (len > 0) { + XMEMCPY(local, data, len); + sha256->buffLen = len; + } + + return ret; +} + +static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) +{ + byte* local = (byte*)sha256->buffer; + + if (sha256 == NULL) { + return BAD_FUNC_ARG; + } + + local[sha256->buffLen++] = 0x80; /* add 1 */ + + /* pad with zeros */ + if (sha256->buffLen > WC_SHA256_PAD_SIZE) { + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_BLOCK_SIZE - + sha256->buffLen); + sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen; + Transform_Sha256_Len(sha256, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE); + + sha256->buffLen = 0; + } + XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen); + + /* put lengths in bits */ + sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) + + (sha256->hiLen << 3); + sha256->loLen = sha256->loLen << 3; + + /* store lengths */ + /* ! length ordering dependent on digest endian type ! */ + + sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2] = sha256->hiLen; + sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 1] = sha256->loLen; + + ByteReverseWords( + &(sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2]), + &(sha256->buffer[WC_SHA256_BLOCK_SIZE / sizeof(word32) - 2]), + WC_SHA256_BLOCK_SIZE - WC_SHA256_PAD_SIZE); + Transform_Sha256_Len(sha256, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE); + +#ifdef LITTLE_ENDIAN_ORDER + ByteReverseWords((word32*)hash, sha256->digest, WC_SHA256_DIGEST_SIZE); +#else + XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); +#endif + + return 0; +} + +#endif /* !WOLFSSL_ARMASM_NO_CRYPTO */ + #ifndef NO_SHA256 From 7d67ffac69bfd13d0069bc6fa3089dea6537b2e6 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 24 Aug 2022 14:24:17 +1000 Subject: [PATCH 2/9] Fixup assembly to compile with ARMv7a --- configure.ac | 4 +- wolfcrypt/src/port/arm/armv8-32-curve25519.S | 2258 ++++++++--------- .../src/port/arm/armv8-32-curve25519_c.c | 1994 +++++++-------- wolfcrypt/src/port/arm/armv8-chacha.c | 13 +- wolfcrypt/src/port/arm/armv8-poly1305.c | 1 + wolfcrypt/src/port/arm/armv8-sha256.c | 4 + wolfcrypt/src/sha256.c | 1 + wolfcrypt/test/test.c | 14 +- 8 files changed, 2018 insertions(+), 2271 deletions(-) diff --git a/configure.ac b/configure.ac index c49e60fba..739617392 100644 --- a/configure.ac +++ b/configure.ac @@ -2081,11 +2081,11 @@ then AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]) ;; armv7a) - AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon-vfpv3 -DWOLFSSL_ARMASM_NO_CRYPTO" + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARMASM_NO_CRYPTO -DWOLFSSL_ARM_ARCH=7" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=no - AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon-vfpv3]) + AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon]) ;; *) AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8" diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index b9382c7c3..156116b3a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -3927,59 +3927,47 @@ fe_ge_dbl: ldr r2, [sp, #56] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, 
[r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r1, [sp, #4] ldr r0, [sp, #12] bl fe_sq @@ -3989,188 +3977,164 @@ fe_ge_dbl: # Add-Sub # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r2] - ldr r6, [r2, #4] - adds r7, r3, r5 + ldr r5, [r1, #4] + ldrd r6, r7, [r2] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r1, #8] - ldr r4, [r1, #12] - ldr r5, [r2, #8] - ldr r6, [r2, #12] + ldr r5, [r1, #12] + ldrd r6, r7, [r2, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r2, #16] - ldr r6, [r2, #20] + ldr r5, [r1, #20] + ldrd r6, r7, [r2, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r1, #24] - ldr r4, [r1, #28] - ldr r5, [r2, #24] - ldr r6, [r2, #28] + ldr r5, [r1, #28] + ldrd r6, r7, [r2, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, 
r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp] ldr r1, [sp, #12] ldr r2, [sp, #4] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r1, [sp, #60] ldr r0, [sp, #12] bl fe_sq2 @@ -4178,59 +4142,47 @@ fe_ge_dbl: ldr r1, [sp, #8] # Sub ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] - ldr r7, [r1] - ldr r8, [r1, #4] - ldr r9, [r1, #8] - ldr r10, [r1, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] + ldrd r8, 
r9, [r1] + ldrd r10, r11, [r1, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r0, #16] - ldr r4, [r0, #20] - ldr r5, [r0, #24] - ldr r6, [r0, #28] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r0, #20] + ldrd r6, r7, [r0, #24] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] add sp, sp, #16 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_dbl,.-fe_ge_dbl @@ -4250,117 +4202,93 @@ fe_ge_madd: ldr r2, [sp, #68] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #72] ldr r2, [sp, #68] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + 
ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #88] ldr r1, [sp] ldr r0, [sp, #8] @@ -4379,300 +4307,270 @@ fe_ge_madd: # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, 
r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #76] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #8] ldr r1, [sp, #12] # Add-Sub # Add ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r1] - ldr r6, [r1, #4] - adds r7, r3, r5 + ldr r5, [r0, #4] + ldrd r6, r7, [r1] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r0, #8] - ldr r4, [r0, #12] - ldr r5, [r1, #8] - ldr r6, [r1, #12] + ldr r5, [r0, #12] + ldrd r6, r7, [r1, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 
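A note on the interleaved "Add" / "Sub" blocks in these fe_ge_* routines: the same pair of limbs is both added and subtracted before moving on to the next 64-bit chunk, so the add carry and the sub borrow each have to survive the other operation. The code parks the carry in r12 with adc r12, r12, #0, parks the carry flag left by the subtraction (the inverted borrow) in lr the same way, and re-creates the flag at the start of the next chunk with adds r12, r12, #-1 or adds lr, lr, #-1. A small C model with explicit carry and borrow words, using illustrative names only:

#include <stdint.h>

/* Compute r_add = a + b and r_sub = a - b limb-wise in one pass over
 * 8 little-endian 32-bit limbs, threading the add carry and the sub
 * borrow through separate words the way the assembly threads them
 * through r12 and lr (the assembly keeps the inverted borrow). */
static void fe_add_sub(uint32_t r_add[8], uint32_t r_sub[8],
                       const uint32_t a[8], const uint32_t b[8])
{
    uint32_t carry = 0;   /* plays the role of r12 */
    uint32_t borrow = 0;  /* lr holds NOT borrow in the assembly */
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t s = (uint64_t)a[i] + b[i] + carry;
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r_add[i] = (uint32_t)s;
        r_sub[i] = (uint32_t)d;
        carry  = (uint32_t)(s >> 32);
        borrow = (uint32_t)((d >> 32) & 1);
    }
}

No reduction happens inside the Add-Sub pass itself; the masked-modulus step that closes each block handles the possible overflow and underflow separately.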
adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r0, #16] - ldr r4, [r0, #20] - ldr r5, [r1, #16] - ldr r6, [r1, #20] + ldr r5, [r0, #20] + ldrd r6, r7, [r1, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r0, #24] - ldr r4, [r0, #28] - ldr r5, [r1, #24] - ldr r6, [r1, #28] + ldr r5, [r0, #28] + ldrd r6, r7, [r1, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_madd,.-fe_ge_madd @@ -4692,117 +4590,93 @@ fe_ge_msub: ldr r2, [sp, #68] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 
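The "Mask the modulus" / "Sub modulus (if overflow)" sequences that close these blocks implement a branchless conditional reduction: r4 (formerly r11) takes the arithmetic shift of the top limb, so it is all-ones exactly when bit 255 of the result is set, and the masked constants 0xffffffed for limb 0 and 0x7fffffff for limb 7 together form 2^255 - 19. A minimal C sketch of the overflow case, with an illustrative function name and an array standing in for the register file:

#include <stdint.h>

/* Branchless conditional subtraction of p = 2^255 - 19 from an 8-limb
 * little-endian value, mirroring "Mask the modulus" followed by
 * "Sub modulus (if overflow)".  Illustrative only; the assembly keeps
 * the limbs in registers rather than memory. */
static void fe_cond_sub_p(uint32_t a[8])
{
    /* asr #31 of the top limb: all-ones when bit 255 is set, else 0. */
    uint32_t mask = (uint32_t)((int32_t)a[7] >> 31);
    uint32_t p[8] = {
        mask & 0xffffffedU,                 /* low limb of p (-19 masked) */
        mask, mask, mask, mask, mask, mask,
        mask & 0x7fffffffU                  /* top limb of p */
    };
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t t = (uint64_t)a[i] - p[i] - borrow;
        a[i]   = (uint32_t)t;
        borrow = (t >> 32) & 1;             /* 1 if this limb borrowed */
    }
}

The "Add modulus (if underflow)" variant is the mirror image: the same mask, derived from the sign of the top limb after a subtraction, selects whether 2^255 - 19 is added back by the adds/adcs chain.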
adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #72] ldr r2, [sp, #68] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #92] ldr r1, [sp] ldr r0, [sp, #8] @@ -4821,300 +4695,270 @@ fe_ge_msub: # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - 
sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #76] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + 
sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #12] ldr r1, [sp, #8] # Add-Sub # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r1, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r1, #8] - ldr r4, [r1, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r1, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r1, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r1, #24] - ldr r4, [r1, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r1, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - 
adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_msub,.-fe_ge_msub @@ -5134,117 +4978,93 @@ fe_ge_add: ldr r2, [sp, #132] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #136] ldr r2, [sp, #132] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, 
r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #156] ldr r1, [sp] ldr r0, [sp, #8] @@ -5265,303 +5085,273 @@ fe_ge_add: ldr r1, [sp] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp] ldr r2, [sp, #8] # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr 
r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #12] add r2, sp, #16 # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r1] - ldr r6, [r1, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r1] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r1, #8] - ldr r6, [r1, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r1, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r1, #16] - ldr r6, [r1, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r1, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r1, #24] - ldr r6, [r1, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r1, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] 
+ ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_add,.-fe_ge_add @@ -5581,117 +5371,93 @@ fe_ge_sub: ldr r2, [sp, #132] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #136] ldr r2, [sp, #132] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, 
r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #160] ldr r1, [sp] ldr r0, [sp, #8] @@ -5712,303 +5478,273 @@ fe_ge_sub: ldr r1, [sp] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp] ldr r2, [sp, #8] # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, 
r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #12] ldr r1, [sp, #8] add r2, sp, #16 # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] 
# Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_sub,.-fe_ge_sub diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 3967f1836..7a5be1771 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -3936,44 +3936,44 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #52]\n\t" "ldr r2, [sp, #56]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" 
"adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r1, [sp, #4]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_sq\n\t" @@ -3982,189 +3982,189 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r2, [sp]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r2]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r2]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #8]\n\t" - "ldrd r5, r6, [r2, #8]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r2, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r2, #16]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r2, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - 
"ldrd %[rt], r4, [r1, #24]\n\t" - "ldrd r5, r6, [r2, #24]\n\t" + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r2, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp]\n\t" "ldr r1, [sp, #12]\n\t" "ldr r2, [sp, #4]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, 
r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r1, [sp, #60]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_sq2\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" /* Sub */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "ldrd r7, r8, [r1]\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "ldrd r8, r9, [r1]\n\t" + "ldrd r10, r11, [r1, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "add sp, sp, #16\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4187,86 +4187,86 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, 
#8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, 
[r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #88]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4284,237 +4284,237 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + 
"asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #76]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #12]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r1]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r1]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #8]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, 
r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #16]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #24]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" + "ldrd %[rt], r5, [r0, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #32\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4541,86 +4541,86 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" 
- "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - 
"strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #92]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4638,237 +4638,237 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + 
"ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #76]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, 
r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #32\n\t" : [rx] "+r" 
(rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4895,86 +4895,86 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, 
r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #156]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4994,240 +4994,240 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "add r0, sp, #16\n\t" "ldr r1, [sp]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp]\n\t" "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + 
"adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #12]\n\t" "add r2, sp, #16\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r1]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r1]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, 
r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], 
%[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #0x60\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -5255,86 +5255,86 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - 
"and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #160]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -5354,240 +5354,240 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "add r0, sp, #16\n\t" "ldr r1, [sp]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp]\n\t" "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, 
r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" "add r2, sp, #16\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], 
r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, 
#8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #0x60\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index 7b0bd1a6c..83d242671 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -987,7 +987,12 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VMOV d4, r8, r9 \n\t" "STRD r10, r11, %[x_10] \n\t" "VMOV d5, r10, r11 \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 8) + "LDR r11, [r14, #4*14] \n\t" + "LDR r10, [r14, #4*15] \n\t" +#else "LDRD r11, r10, [r14, #4*14] \n\t" +#endif "VMOV q4, q0 \n\t" "VMOV q5, q1 \n\t" "VMOV q6, q2 \n\t" @@ -2754,11 +2759,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, /* XOR 8 bytes */ "CMP %[bytes], #8 \n\t" "BLT L_chacha20_arm32_64_lt_8_%= \n\t" - "VLDR d8, [%[m], #0] \n\t" - "ADD %[m], %[m], #8 \n\t" + "VLD1.64 { d8 }, [%[m]]! \n\t" "VEOR d8, d8, d0 \n\t" - "VSTR d8, [%[c], #0] \n\t" - "ADD %[c], %[c], #8 \n\t" + "VST1.64 { d8 }, [%[c]]! \n\t" "SUBS %[bytes], %[bytes], #8 \n\t" "VMOV d0, d1 \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t" @@ -2772,7 +2775,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, "EOR r12, r12, r14 \n\t" "STR r12, [%[c]], #4 \n\t" "SUBS %[bytes], %[bytes], #4 \n\t" - "VTRN.32 d0, d0 \n\t" + "VSHR.U64 d0, d0, #32 \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t" "\n" "L_chacha20_arm32_64_lt_4_%=: \n\t" diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index 637599827..5ed722bcb 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -29,6 +29,7 @@ #endif #include +#include #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index 4109dd19f..730f5c599 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -1537,7 +1537,11 @@ int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data) #else XMEMCPY(sha256->buffer, data, WC_SHA256_BLOCK_SIZE); #endif +#ifndef WOLFSSL_ARMASM_NO_CRYPTO Sha256Transform(sha256, data, 1); +#else + Transform_Sha256_Len(sha256, data, WC_SHA256_BLOCK_SIZE); +#endif return 0; } #endif diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index dab3c97e3..fda36af7b 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -43,6 +43,7 @@ on the specific device platform. 
#endif #include +#include /* * SHA256 Build Options: diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 4867a741d..927865c88 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -5921,8 +5921,10 @@ WOLFSSL_TEST_SUBROUTINE int chacha_test(void) return -4722; for (i = 0; i < 18; ++i) { - /* this will test all paths */ - /* block sizes: 1 2 3 4 7 8 15 16 31 32 63 64 127 128 255 256 511 512 */ + /* this will test all paths + * block sizes: 1 3 7 15 31 63 127 255 511 (i = 0- 8) + * 2 4 8 16 32 64 128 256 512 (i = 9-17) + */ block_size = (2 << (i%9)) - (i<9?1:0); keySz = 32; @@ -5936,16 +5938,16 @@ WOLFSSL_TEST_SUBROUTINE int chacha_test(void) if (ret != 0) return ret; - ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, block_size); - ret |= wc_Chacha_Process(&dec, plain_big, cipher_big, block_size); + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big , block_size); + ret |= wc_Chacha_Process(&dec, plain_big , cipher_big, block_size); if (ret != 0) return ret; if (XMEMCMP(plain_big, input_big, block_size)) - return -4723-i; + return -4740-i*2; if (XMEMCMP(cipher_big, cipher_big_result, block_size)) - return -4724-i; + return -4741-i*2; } /* Streaming test */ From 5dc1732036ddcee4f9ad353f735a3dd2defc5908 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 8 Sep 2022 08:39:00 +1000 Subject: [PATCH 3/9] Fix define name WOLFSSL_ARMASM_NO_CRYPTO -> WOLFSSL_ARMASM_NO_HW_CRYPTO --- configure.ac | 2 +- wolfcrypt/src/aes.c | 6 +++--- wolfcrypt/src/port/arm/armv8-aes.c | 2 +- wolfcrypt/src/port/arm/armv8-sha256.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/configure.ac b/configure.ac index 739617392..a8cc754f2 100644 --- a/configure.ac +++ b/configure.ac @@ -2081,7 +2081,7 @@ then AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]) ;; armv7a) - AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARMASM_NO_CRYPTO -DWOLFSSL_ARM_ARCH=7" + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=7" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=no diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index e8fa52d22..952deea4f 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -306,7 +306,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_CRYPTO) +#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) #ifdef WOLFSSL_IMX6_CAAM_BLOB /* case of possibly not using hardware acceleration for AES but using key @@ -4601,7 +4601,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #endif -#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO) +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(WOLFSSL_AFALG) @@ -9933,7 +9933,7 @@ int wc_AesCcmCheckTagSize(int sz) return 0; } -#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_CRYPTO) +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(HAVE_COLDFIRE_SEC) diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 0056a3ccb..49e78606f 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -33,7 +33,7 @@ #include #if !defined(NO_AES) && defined(WOLFSSL_ARMASM) && \ - 
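/*
 * The rename above means builds that previously defined
 * WOLFSSL_ARMASM_NO_CRYPTO by hand need the new name.  A user_settings.h
 * sketch for a NEON-only ARMv7-A build; only WOLFSSL_ARM_ARCH and
 * WOLFSSL_ARMASM_NO_HW_CRYPTO appear in the configure.ac hunk, the
 * WOLFSSL_ARMASM define itself is assumed to come from --enable-armasm:
 */
#define WOLFSSL_ARMASM                /* build the ARM assembly code (assumed) */
#define WOLFSSL_ARM_ARCH 7            /* target ARMv7-A */
#define WOLFSSL_ARMASM_NO_HW_CRYPTO   /* no SHA-2/AES Crypto Extensions */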
!defined(WOLFSSL_ARMASM_NO_CRYPTO) + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) #ifdef HAVE_FIPS #undef HAVE_FIPS diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index 730f5c599..ca26e4423 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -45,7 +45,7 @@ #endif -#ifndef WOLFSSL_ARMASM_NO_CRYPTO +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO static const ALIGN32 word32 K[64] = { 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, @@ -96,7 +96,7 @@ static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) } -#ifndef WOLFSSL_ARMASM_NO_CRYPTO +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO #ifdef __aarch64__ @@ -1411,7 +1411,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) return 0; } -#endif /* !WOLFSSL_ARMASM_NO_CRYPTO */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifndef NO_SHA256 @@ -1537,7 +1537,7 @@ int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data) #else XMEMCPY(sha256->buffer, data, WC_SHA256_BLOCK_SIZE); #endif -#ifndef WOLFSSL_ARMASM_NO_CRYPTO +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO Sha256Transform(sha256, data, 1); #else Transform_Sha256_Len(sha256, data, WC_SHA256_BLOCK_SIZE); From 0db0032b318190b6fe7564048ff1a5083a838174 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 9 Sep 2022 10:19:17 +1000 Subject: [PATCH 4/9] ARM32 ASM: vrev not always available Provide alternative assembly instructions to vrev when WOLFSSL_ARM_ARCH_NO_VREV is defined. --- wolfcrypt/src/port/arm/armv8-32-sha256-asm.S | 21 +++++++- .../src/port/arm/armv8-32-sha256-asm_c.c | 21 +++++++- wolfcrypt/src/port/arm/armv8-32-sha512-asm.S | 51 +++++++++++++++++++ .../src/port/arm/armv8-32-sha512-asm_c.c | 51 +++++++++++++++++++ 4 files changed, 142 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index 6132aac4a..446d3877c 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -1,6 +1,6 @@ /* armv8-32-sha256-asm * - * Copyright (C) 2006-2021 wolfSSL Inc. + * Copyright (C) 2006-2022 wolfSSL Inc. * * This file is part of wolfSSL. * @@ -1559,10 +1559,29 @@ Transform_Sha256_Len: L_SHA256_transform_neon_len_begin: # Load W vldm.32 r1!, {d0-d7} +#ifndef WOLFSSL_ARM_ARCH_NO_VREV vrev32.8 q0, q0 vrev32.8 q1, q1 vrev32.8 q2, q2 vrev32.8 q3, q3 +#else + vshl.i16 q4, q0, #8 + vshl.i16 q5, q1, #8 + vsri.i16 q4, q0, #8 + vsri.i16 q5, q1, #8 + vshl.i32 q0, q4, #16 + vshl.i32 q1, q5, #16 + vsri.i32 q0, q4, #16 + vsri.i32 q1, q5, #16 + vshl.i16 q4, q2, #8 + vshl.i16 q5, q3, #8 + vsri.i16 q4, q2, #8 + vsri.i16 q5, q3, #8 + vshl.i32 q2, q4, #16 + vshl.i32 q3, q5, #16 + vsri.i32 q2, q4, #16 + vsri.i32 q3, q5, #16 +#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ str r1, [sp, #4] mov lr, #3 # Start of 16 rounds diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 73a53027c..5cd910b2a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -1,6 +1,6 @@ /* armv8-32-sha256-asm * - * Copyright (C) 2006-2021 wolfSSL Inc. + * Copyright (C) 2006-2022 wolfSSL Inc. * * This file is part of wolfSSL. 
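/*
 * The fallback added above byte-reverses each 32-bit lane without vrev32.8
 * by using shift / shift-right-insert pairs: first swap the bytes inside
 * each 16-bit half, then swap the two halves.  Scalar equivalent for one
 * word (illustrative only; the SHA-512 variant below adds a third step at
 * 64 bits):
 */
#include <stdint.h>

static uint32_t bswap32_two_step(uint32_t x)
{
    /* vshl.i16 #8 / vsri.i16 #8: swap bytes within each 16-bit lane */
    x = ((x << 8) & 0xff00ff00u) | ((x >> 8) & 0x00ff00ffu);
    /* vshl.i32 #16 / vsri.i32 #16: swap the 16-bit lanes */
    x = (x << 16) | (x >> 16);
    return x;
}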
* @@ -1554,10 +1554,29 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "L_SHA256_transform_neon_len_begin_%=: \n\t" /* Load W */ "vldm.32 %[data]!, {d0-d7}\n\t" +#ifndef WOLFSSL_ARM_ARCH_NO_VREV "vrev32.8 q0, q0\n\t" "vrev32.8 q1, q1\n\t" "vrev32.8 q2, q2\n\t" "vrev32.8 q3, q3\n\t" +#else + "vshl.i16 q4, q0, #8\n\t" + "vshl.i16 q5, q1, #8\n\t" + "vsri.i16 q4, q0, #8\n\t" + "vsri.i16 q5, q1, #8\n\t" + "vshl.i32 q0, q4, #16\n\t" + "vshl.i32 q1, q5, #16\n\t" + "vsri.i32 q0, q4, #16\n\t" + "vsri.i32 q1, q5, #16\n\t" + "vshl.i16 q4, q2, #8\n\t" + "vshl.i16 q5, q3, #8\n\t" + "vsri.i16 q4, q2, #8\n\t" + "vsri.i16 q5, q3, #8\n\t" + "vshl.i32 q2, q4, #16\n\t" + "vshl.i32 q3, q5, #16\n\t" + "vsri.i32 q2, q4, #16\n\t" + "vsri.i32 q3, q5, #16\n\t" +#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ "str %[data], [sp, #4]\n\t" "mov lr, #3\n\t" /* Start of 16 rounds */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 83e3aa6a5..4cb888a23 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -4219,6 +4219,7 @@ Transform_Sha512_Len: L_SHA512_transform_neon_len_begin: # Load W vldm.64 r1!, {d16-d31} +#ifndef WOLFSSL_ARM_ARCH_NO_VREV vrev64.8 q8, q8 vrev64.8 q9, q9 vrev64.8 q10, q10 @@ -4227,6 +4228,56 @@ L_SHA512_transform_neon_len_begin: vrev64.8 q13, q13 vrev64.8 q14, q14 vrev64.8 q15, q15 +#else + vshl.i16 q4, q8, #8 + vshl.i16 q5, q9, #8 + vsri.i16 q4, q8, #8 + vsri.i16 q5, q9, #8 + vshl.i32 q6, q4, #16 + vshl.i32 q7, q5, #16 + vsri.i32 q6, q4, #16 + vsri.i32 q7, q5, #16 + vshl.i64 q8, q6, #32 + vshl.i64 q9, q7, #32 + vsri.i64 q8, q6, #32 + vsri.i64 q9, q7, #32 + vshl.i16 q4, q10, #8 + vshl.i16 q5, q11, #8 + vsri.i16 q4, q10, #8 + vsri.i16 q5, q11, #8 + vshl.i32 q6, q4, #16 + vshl.i32 q7, q5, #16 + vsri.i32 q6, q4, #16 + vsri.i32 q7, q5, #16 + vshl.i64 q10, q6, #32 + vshl.i64 q11, q7, #32 + vsri.i64 q10, q6, #32 + vsri.i64 q11, q7, #32 + vshl.i16 q4, q12, #8 + vshl.i16 q5, q13, #8 + vsri.i16 q4, q12, #8 + vsri.i16 q5, q13, #8 + vshl.i32 q6, q4, #16 + vshl.i32 q7, q5, #16 + vsri.i32 q6, q4, #16 + vsri.i32 q7, q5, #16 + vshl.i64 q12, q6, #32 + vshl.i64 q13, q7, #32 + vsri.i64 q12, q6, #32 + vsri.i64 q13, q7, #32 + vshl.i16 q4, q14, #8 + vshl.i16 q5, q15, #8 + vsri.i16 q4, q14, #8 + vsri.i16 q5, q15, #8 + vshl.i32 q6, q4, #16 + vshl.i32 q7, q5, #16 + vsri.i32 q6, q4, #16 + vsri.i32 q7, q5, #16 + vshl.i64 q14, q6, #32 + vshl.i64 q15, q7, #32 + vsri.i64 q14, q6, #32 + vsri.i64 q15, q7, #32 +#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ adr r3, L_SHA512_transform_neon_len_k mov r12, #4 # Start of 16 rounds diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index 103ad4fa1..4d5eb18c1 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -3662,6 +3662,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "L_SHA512_transform_neon_len_begin_%=: \n\t" /* Load W */ "vldm.64 %[data]!, {d16-d31}\n\t" +#ifndef WOLFSSL_ARM_ARCH_NO_VREV "vrev64.8 q8, q8\n\t" "vrev64.8 q9, q9\n\t" "vrev64.8 q10, q10\n\t" @@ -3670,6 +3671,56 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vrev64.8 q13, q13\n\t" "vrev64.8 q14, q14\n\t" "vrev64.8 q15, q15\n\t" +#else + "vshl.i16 q4, q8, #8\n\t" + "vshl.i16 q5, q9, #8\n\t" + "vsri.i16 q4, q8, #8\n\t" + "vsri.i16 q5, q9, #8\n\t" + "vshl.i32 q6, q4, #16\n\t" + "vshl.i32 q7, q5, #16\n\t" + 
"vsri.i32 q6, q4, #16\n\t" + "vsri.i32 q7, q5, #16\n\t" + "vshl.i64 q8, q6, #32\n\t" + "vshl.i64 q9, q7, #32\n\t" + "vsri.i64 q8, q6, #32\n\t" + "vsri.i64 q9, q7, #32\n\t" + "vshl.i16 q4, q10, #8\n\t" + "vshl.i16 q5, q11, #8\n\t" + "vsri.i16 q4, q10, #8\n\t" + "vsri.i16 q5, q11, #8\n\t" + "vshl.i32 q6, q4, #16\n\t" + "vshl.i32 q7, q5, #16\n\t" + "vsri.i32 q6, q4, #16\n\t" + "vsri.i32 q7, q5, #16\n\t" + "vshl.i64 q10, q6, #32\n\t" + "vshl.i64 q11, q7, #32\n\t" + "vsri.i64 q10, q6, #32\n\t" + "vsri.i64 q11, q7, #32\n\t" + "vshl.i16 q4, q12, #8\n\t" + "vshl.i16 q5, q13, #8\n\t" + "vsri.i16 q4, q12, #8\n\t" + "vsri.i16 q5, q13, #8\n\t" + "vshl.i32 q6, q4, #16\n\t" + "vshl.i32 q7, q5, #16\n\t" + "vsri.i32 q6, q4, #16\n\t" + "vsri.i32 q7, q5, #16\n\t" + "vshl.i64 q12, q6, #32\n\t" + "vshl.i64 q13, q7, #32\n\t" + "vsri.i64 q12, q6, #32\n\t" + "vsri.i64 q13, q7, #32\n\t" + "vshl.i16 q4, q14, #8\n\t" + "vshl.i16 q5, q15, #8\n\t" + "vsri.i16 q4, q14, #8\n\t" + "vsri.i16 q5, q15, #8\n\t" + "vshl.i32 q6, q4, #16\n\t" + "vshl.i32 q7, q5, #16\n\t" + "vsri.i32 q6, q4, #16\n\t" + "vsri.i32 q7, q5, #16\n\t" + "vshl.i64 q14, q6, #32\n\t" + "vshl.i64 q15, q7, #32\n\t" + "vsri.i64 q14, q6, #32\n\t" + "vsri.i64 q15, q7, #32\n\t" +#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ "mov r3, %[L_SHA512_transform_neon_len_k]\n\t" "mov r12, #4\n\t" /* Start of 16 rounds */ From 2c4c7ba6dad7b286fb14e9cf37c6bfec02f0d890 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 12 Sep 2022 10:00:18 +1000 Subject: [PATCH 5/9] ARM v7a ASM: 128-bit registers not supported Cortex-A5 - Cortex-A9 only support 64-bit wide NEON. Remove use of WOLFSSL_ARM_ARCH_NO_VREV. Use WOLFSSL_ARM_ARCH_NEON_64BIT to indicate to use 64-bit NEON registers and not 128-bit NEON registers. --- wolfcrypt/src/port/arm/armv8-32-sha256-asm.S | 28 +- .../src/port/arm/armv8-32-sha256-asm_c.c | 28 +- wolfcrypt/src/port/arm/armv8-32-sha512-asm.S | 415 +++++++++++++++--- .../src/port/arm/armv8-32-sha512-asm_c.c | 415 +++++++++++++++--- 4 files changed, 750 insertions(+), 136 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index 446d3877c..e705558a9 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -1559,29 +1559,21 @@ Transform_Sha256_Len: L_SHA256_transform_neon_len_begin: # Load W vldm.32 r1!, {d0-d7} -#ifndef WOLFSSL_ARM_ARCH_NO_VREV +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT vrev32.8 q0, q0 vrev32.8 q1, q1 vrev32.8 q2, q2 vrev32.8 q3, q3 #else - vshl.i16 q4, q0, #8 - vshl.i16 q5, q1, #8 - vsri.i16 q4, q0, #8 - vsri.i16 q5, q1, #8 - vshl.i32 q0, q4, #16 - vshl.i32 q1, q5, #16 - vsri.i32 q0, q4, #16 - vsri.i32 q1, q5, #16 - vshl.i16 q4, q2, #8 - vshl.i16 q5, q3, #8 - vsri.i16 q4, q2, #8 - vsri.i16 q5, q3, #8 - vshl.i32 q2, q4, #16 - vshl.i32 q3, q5, #16 - vsri.i32 q2, q4, #16 - vsri.i32 q3, q5, #16 -#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ + vrev32.8 d0, d0 + vrev32.8 d1, d1 + vrev32.8 d2, d2 + vrev32.8 d3, d3 + vrev32.8 d4, d4 + vrev32.8 d5, d5 + vrev32.8 d6, d6 + vrev32.8 d7, d7 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ str r1, [sp, #4] mov lr, #3 # Start of 16 rounds diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 5cd910b2a..06ea84e1f 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -1554,29 +1554,21 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) 
"L_SHA256_transform_neon_len_begin_%=: \n\t" /* Load W */ "vldm.32 %[data]!, {d0-d7}\n\t" -#ifndef WOLFSSL_ARM_ARCH_NO_VREV +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT "vrev32.8 q0, q0\n\t" "vrev32.8 q1, q1\n\t" "vrev32.8 q2, q2\n\t" "vrev32.8 q3, q3\n\t" #else - "vshl.i16 q4, q0, #8\n\t" - "vshl.i16 q5, q1, #8\n\t" - "vsri.i16 q4, q0, #8\n\t" - "vsri.i16 q5, q1, #8\n\t" - "vshl.i32 q0, q4, #16\n\t" - "vshl.i32 q1, q5, #16\n\t" - "vsri.i32 q0, q4, #16\n\t" - "vsri.i32 q1, q5, #16\n\t" - "vshl.i16 q4, q2, #8\n\t" - "vshl.i16 q5, q3, #8\n\t" - "vsri.i16 q4, q2, #8\n\t" - "vsri.i16 q5, q3, #8\n\t" - "vshl.i32 q2, q4, #16\n\t" - "vshl.i32 q3, q5, #16\n\t" - "vsri.i32 q2, q4, #16\n\t" - "vsri.i32 q3, q5, #16\n\t" -#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ + "vrev32.8 d0, d0\n\t" + "vrev32.8 d1, d1\n\t" + "vrev32.8 d2, d2\n\t" + "vrev32.8 d3, d3\n\t" + "vrev32.8 d4, d4\n\t" + "vrev32.8 d5, d5\n\t" + "vrev32.8 d6, d6\n\t" + "vrev32.8 d7, d7\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ "str %[data], [sp, #4]\n\t" "mov lr, #3\n\t" /* Start of 16 rounds */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 4cb888a23..7d4dcdc26 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -4219,7 +4219,7 @@ Transform_Sha512_Len: L_SHA512_transform_neon_len_begin: # Load W vldm.64 r1!, {d16-d31} -#ifndef WOLFSSL_ARM_ARCH_NO_VREV +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT vrev64.8 q8, q8 vrev64.8 q9, q9 vrev64.8 q10, q10 @@ -4229,55 +4229,23 @@ L_SHA512_transform_neon_len_begin: vrev64.8 q14, q14 vrev64.8 q15, q15 #else - vshl.i16 q4, q8, #8 - vshl.i16 q5, q9, #8 - vsri.i16 q4, q8, #8 - vsri.i16 q5, q9, #8 - vshl.i32 q6, q4, #16 - vshl.i32 q7, q5, #16 - vsri.i32 q6, q4, #16 - vsri.i32 q7, q5, #16 - vshl.i64 q8, q6, #32 - vshl.i64 q9, q7, #32 - vsri.i64 q8, q6, #32 - vsri.i64 q9, q7, #32 - vshl.i16 q4, q10, #8 - vshl.i16 q5, q11, #8 - vsri.i16 q4, q10, #8 - vsri.i16 q5, q11, #8 - vshl.i32 q6, q4, #16 - vshl.i32 q7, q5, #16 - vsri.i32 q6, q4, #16 - vsri.i32 q7, q5, #16 - vshl.i64 q10, q6, #32 - vshl.i64 q11, q7, #32 - vsri.i64 q10, q6, #32 - vsri.i64 q11, q7, #32 - vshl.i16 q4, q12, #8 - vshl.i16 q5, q13, #8 - vsri.i16 q4, q12, #8 - vsri.i16 q5, q13, #8 - vshl.i32 q6, q4, #16 - vshl.i32 q7, q5, #16 - vsri.i32 q6, q4, #16 - vsri.i32 q7, q5, #16 - vshl.i64 q12, q6, #32 - vshl.i64 q13, q7, #32 - vsri.i64 q12, q6, #32 - vsri.i64 q13, q7, #32 - vshl.i16 q4, q14, #8 - vshl.i16 q5, q15, #8 - vsri.i16 q4, q14, #8 - vsri.i16 q5, q15, #8 - vshl.i32 q6, q4, #16 - vshl.i32 q7, q5, #16 - vsri.i32 q6, q4, #16 - vsri.i32 q7, q5, #16 - vshl.i64 q14, q6, #32 - vshl.i64 q15, q7, #32 - vsri.i64 q14, q6, #32 - vsri.i64 q15, q7, #32 -#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ + vrev64.8 d16, d16 + vrev64.8 d17, d17 + vrev64.8 d18, d18 + vrev64.8 d19, d19 + vrev64.8 d20, d20 + vrev64.8 d21, d21 + vrev64.8 d22, d22 + vrev64.8 d23, d23 + vrev64.8 d24, d24 + vrev64.8 d25, d25 + vrev64.8 d26, d26 + vrev64.8 d27, d27 + vrev64.8 d28, d28 + vrev64.8 d29, d29 + vrev64.8 d30, d30 + vrev64.8 d31, d31 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ adr r3, L_SHA512_transform_neon_len_k mov r12, #4 # Start of 16 rounds @@ -4340,6 +4308,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d2, d6 vadd.i64 d6, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[0]-W[1] vext.8 q6, q8, q9, #8 vshl.u64 q4, q15, #45 @@ -4360,6 +4329,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q8, q5 +#else + # Calc new W[0]-W[1] + 
vmov d12, d17 + vmov d13, d18 + vshl.u64 d8, d30, #45 + vshl.u64 d9, d31, #45 + vsri.u64 d8, d30, #19 + vsri.u64 d9, d31, #19 + vshl.u64 d10, d30, #3 + vshl.u64 d11, d31, #3 + vsri.u64 d10, d30, #61 + vsri.u64 d11, d31, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d30, #6 + vshr.u64 d9, d31, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d16, d10 + vadd.i64 d17, d11 + vmov d14, d25 + vmov d15, d26 + vadd.i64 d16, d14 + vadd.i64 d17, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d16, d10 + vadd.i64 d17, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 2 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d2, #50 @@ -4418,6 +4428,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d0, d4 vadd.i64 d4, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[2]-W[3] vext.8 q6, q9, q10, #8 vshl.u64 q4, q8, #45 @@ -4438,6 +4449,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q9, q5 +#else + # Calc new W[2]-W[3] + vmov d12, d19 + vmov d13, d20 + vshl.u64 d8, d16, #45 + vshl.u64 d9, d17, #45 + vsri.u64 d8, d16, #19 + vsri.u64 d9, d17, #19 + vshl.u64 d10, d16, #3 + vshl.u64 d11, d17, #3 + vsri.u64 d10, d16, #61 + vsri.u64 d11, d17, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d16, #6 + vshr.u64 d9, d17, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d18, d10 + vadd.i64 d19, d11 + vmov d14, d27 + vmov d15, d28 + vadd.i64 d18, d14 + vadd.i64 d19, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d18, d10 + vadd.i64 d19, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 4 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d0, #50 @@ -4496,6 +4548,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d6, d2 vadd.i64 d2, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[4]-W[5] vext.8 q6, q10, q11, #8 vshl.u64 q4, q9, #45 @@ -4516,6 +4569,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q10, q5 +#else + # Calc new W[4]-W[5] + vmov d12, d21 + vmov d13, d22 + vshl.u64 d8, d18, #45 + vshl.u64 d9, d19, #45 + vsri.u64 d8, d18, #19 + vsri.u64 d9, d19, #19 + vshl.u64 d10, d18, #3 + vshl.u64 d11, d19, #3 + vsri.u64 d10, d18, #61 + vsri.u64 d11, d19, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d18, #6 + vshr.u64 d9, d19, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d20, d10 + vadd.i64 d21, d11 + vmov d14, d29 + vmov d15, d30 + vadd.i64 d20, d14 + vadd.i64 d21, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d20, d10 + vadd.i64 d21, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 6 vld1.64 {d12}, [r3:64]! 
vshl.u64 d8, d6, #50 @@ -4574,6 +4668,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d4, d0 vadd.i64 d0, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[6]-W[7] vext.8 q6, q11, q12, #8 vshl.u64 q4, q10, #45 @@ -4594,6 +4689,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q11, q5 +#else + # Calc new W[6]-W[7] + vmov d12, d23 + vmov d13, d24 + vshl.u64 d8, d20, #45 + vshl.u64 d9, d21, #45 + vsri.u64 d8, d20, #19 + vsri.u64 d9, d21, #19 + vshl.u64 d10, d20, #3 + vshl.u64 d11, d21, #3 + vsri.u64 d10, d20, #61 + vsri.u64 d11, d21, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d20, #6 + vshr.u64 d9, d21, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d22, d10 + vadd.i64 d23, d11 + vmov d14, d31 + vmov d15, d16 + vadd.i64 d22, d14 + vadd.i64 d23, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d22, d10 + vadd.i64 d23, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 8 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d4, #50 @@ -4652,6 +4788,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d2, d6 vadd.i64 d6, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[8]-W[9] vext.8 q6, q12, q13, #8 vshl.u64 q4, q11, #45 @@ -4672,6 +4809,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q12, q5 +#else + # Calc new W[8]-W[9] + vmov d12, d25 + vmov d13, d26 + vshl.u64 d8, d22, #45 + vshl.u64 d9, d23, #45 + vsri.u64 d8, d22, #19 + vsri.u64 d9, d23, #19 + vshl.u64 d10, d22, #3 + vshl.u64 d11, d23, #3 + vsri.u64 d10, d22, #61 + vsri.u64 d11, d23, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d22, #6 + vshr.u64 d9, d23, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d24, d10 + vadd.i64 d25, d11 + vmov d14, d17 + vmov d15, d18 + vadd.i64 d24, d14 + vadd.i64 d25, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d24, d10 + vadd.i64 d25, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 10 vld1.64 {d12}, [r3:64]! 
vshl.u64 d8, d2, #50 @@ -4730,6 +4908,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d0, d4 vadd.i64 d4, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[10]-W[11] vext.8 q6, q13, q14, #8 vshl.u64 q4, q12, #45 @@ -4750,6 +4929,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q13, q5 +#else + # Calc new W[10]-W[11] + vmov d12, d27 + vmov d13, d28 + vshl.u64 d8, d24, #45 + vshl.u64 d9, d25, #45 + vsri.u64 d8, d24, #19 + vsri.u64 d9, d25, #19 + vshl.u64 d10, d24, #3 + vshl.u64 d11, d25, #3 + vsri.u64 d10, d24, #61 + vsri.u64 d11, d25, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d24, #6 + vshr.u64 d9, d25, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d26, d10 + vadd.i64 d27, d11 + vmov d14, d19 + vmov d15, d20 + vadd.i64 d26, d14 + vadd.i64 d27, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d26, d10 + vadd.i64 d27, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 12 vld1.64 {d12}, [r3:64]! vshl.u64 d8, d0, #50 @@ -4808,6 +5028,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d6, d2 vadd.i64 d2, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[12]-W[13] vext.8 q6, q14, q15, #8 vshl.u64 q4, q13, #45 @@ -4828,6 +5049,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q14, q5 +#else + # Calc new W[12]-W[13] + vmov d12, d29 + vmov d13, d30 + vshl.u64 d8, d26, #45 + vshl.u64 d9, d27, #45 + vsri.u64 d8, d26, #19 + vsri.u64 d9, d27, #19 + vshl.u64 d10, d26, #3 + vshl.u64 d11, d27, #3 + vsri.u64 d10, d26, #61 + vsri.u64 d11, d27, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d26, #6 + vshr.u64 d9, d27, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d28, d10 + vadd.i64 d29, d11 + vmov d14, d21 + vmov d15, d22 + vadd.i64 d28, d14 + vadd.i64 d29, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d28, d10 + vadd.i64 d29, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ # Round 14 vld1.64 {d12}, [r3:64]! 
vshl.u64 d8, d6, #50 @@ -4886,6 +5148,7 @@ L_SHA512_transform_neon_len_start: vadd.i64 d10, d9 vadd.i64 d4, d0 vadd.i64 d0, d10 +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT # Calc new W[14]-W[15] vext.8 q6, q15, q8, #8 vshl.u64 q4, q14, #45 @@ -4906,6 +5169,47 @@ L_SHA512_transform_neon_len_start: vshr.u64 q6, #7 veor q5, q6 vadd.i64 q15, q5 +#else + # Calc new W[14]-W[15] + vmov d12, d31 + vmov d13, d16 + vshl.u64 d8, d28, #45 + vshl.u64 d9, d29, #45 + vsri.u64 d8, d28, #19 + vsri.u64 d9, d29, #19 + vshl.u64 d10, d28, #3 + vshl.u64 d11, d29, #3 + vsri.u64 d10, d28, #61 + vsri.u64 d11, d29, #61 + veor d10, d8 + veor d11, d9 + vshr.u64 d8, d28, #6 + vshr.u64 d9, d29, #6 + veor d10, d8 + veor d11, d9 + vadd.i64 d30, d10 + vadd.i64 d31, d11 + vmov d14, d23 + vmov d15, d24 + vadd.i64 d30, d14 + vadd.i64 d31, d15 + vshl.u64 d8, d12, #63 + vshl.u64 d9, d13, #63 + vsri.u64 d8, d12, #1 + vsri.u64 d9, d13, #1 + vshl.u64 d10, d12, #56 + vshl.u64 d11, d13, #56 + vsri.u64 d10, d12, #8 + vsri.u64 d11, d13, #8 + veor d10, d8 + veor d11, d9 + vshr.u64 d12, #7 + vshr.u64 d13, #7 + veor d10, d12 + veor d11, d13 + vadd.i64 d30, d10 + vadd.i64 d31, d11 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ subs r12, r12, #1 bne L_SHA512_transform_neon_len_start # Round 0 @@ -5374,10 +5678,21 @@ L_SHA512_transform_neon_len_start: vadd.i64 d0, d10 # Add in digest from start vldm.64 r0, {d8-d15} +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT vadd.i64 q0, q0, q4 vadd.i64 q1, q1, q5 vadd.i64 q2, q2, q6 vadd.i64 q3, q3, q7 +#else + vadd.i64 d0, d0, d8 + vadd.i64 d1, d1, d9 + vadd.i64 d2, d2, d10 + vadd.i64 d3, d3, d11 + vadd.i64 d4, d4, d12 + vadd.i64 d5, d5, d13 + vadd.i64 d6, d6, d14 + vadd.i64 d7, d7, d15 +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ vstm.64 r0, {d0-d7} subs r2, r2, #0x80 bne L_SHA512_transform_neon_len_begin diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index 4d5eb18c1..5c17f0151 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -3662,7 +3662,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "L_SHA512_transform_neon_len_begin_%=: \n\t" /* Load W */ "vldm.64 %[data]!, {d16-d31}\n\t" -#ifndef WOLFSSL_ARM_ARCH_NO_VREV +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT "vrev64.8 q8, q8\n\t" "vrev64.8 q9, q9\n\t" "vrev64.8 q10, q10\n\t" @@ -3672,55 +3672,23 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vrev64.8 q14, q14\n\t" "vrev64.8 q15, q15\n\t" #else - "vshl.i16 q4, q8, #8\n\t" - "vshl.i16 q5, q9, #8\n\t" - "vsri.i16 q4, q8, #8\n\t" - "vsri.i16 q5, q9, #8\n\t" - "vshl.i32 q6, q4, #16\n\t" - "vshl.i32 q7, q5, #16\n\t" - "vsri.i32 q6, q4, #16\n\t" - "vsri.i32 q7, q5, #16\n\t" - "vshl.i64 q8, q6, #32\n\t" - "vshl.i64 q9, q7, #32\n\t" - "vsri.i64 q8, q6, #32\n\t" - "vsri.i64 q9, q7, #32\n\t" - "vshl.i16 q4, q10, #8\n\t" - "vshl.i16 q5, q11, #8\n\t" - "vsri.i16 q4, q10, #8\n\t" - "vsri.i16 q5, q11, #8\n\t" - "vshl.i32 q6, q4, #16\n\t" - "vshl.i32 q7, q5, #16\n\t" - "vsri.i32 q6, q4, #16\n\t" - "vsri.i32 q7, q5, #16\n\t" - "vshl.i64 q10, q6, #32\n\t" - "vshl.i64 q11, q7, #32\n\t" - "vsri.i64 q10, q6, #32\n\t" - "vsri.i64 q11, q7, #32\n\t" - "vshl.i16 q4, q12, #8\n\t" - "vshl.i16 q5, q13, #8\n\t" - "vsri.i16 q4, q12, #8\n\t" - "vsri.i16 q5, q13, #8\n\t" - "vshl.i32 q6, q4, #16\n\t" - "vshl.i32 q7, q5, #16\n\t" - "vsri.i32 q6, q4, #16\n\t" - "vsri.i32 q7, q5, #16\n\t" - "vshl.i64 q12, q6, #32\n\t" - "vshl.i64 q13, q7, #32\n\t" - "vsri.i64 q12, q6, #32\n\t" - 
"vsri.i64 q13, q7, #32\n\t" - "vshl.i16 q4, q14, #8\n\t" - "vshl.i16 q5, q15, #8\n\t" - "vsri.i16 q4, q14, #8\n\t" - "vsri.i16 q5, q15, #8\n\t" - "vshl.i32 q6, q4, #16\n\t" - "vshl.i32 q7, q5, #16\n\t" - "vsri.i32 q6, q4, #16\n\t" - "vsri.i32 q7, q5, #16\n\t" - "vshl.i64 q14, q6, #32\n\t" - "vshl.i64 q15, q7, #32\n\t" - "vsri.i64 q14, q6, #32\n\t" - "vsri.i64 q15, q7, #32\n\t" -#endif /* WOLFSSL_ARM_ARCH_NO_VREV */ + "vrev64.8 d16, d16\n\t" + "vrev64.8 d17, d17\n\t" + "vrev64.8 d18, d18\n\t" + "vrev64.8 d19, d19\n\t" + "vrev64.8 d20, d20\n\t" + "vrev64.8 d21, d21\n\t" + "vrev64.8 d22, d22\n\t" + "vrev64.8 d23, d23\n\t" + "vrev64.8 d24, d24\n\t" + "vrev64.8 d25, d25\n\t" + "vrev64.8 d26, d26\n\t" + "vrev64.8 d27, d27\n\t" + "vrev64.8 d28, d28\n\t" + "vrev64.8 d29, d29\n\t" + "vrev64.8 d30, d30\n\t" + "vrev64.8 d31, d31\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ "mov r3, %[L_SHA512_transform_neon_len_k]\n\t" "mov r12, #4\n\t" /* Start of 16 rounds */ @@ -3784,6 +3752,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d2, d6\n\t" "vadd.i64 d6, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[0]-W[1] */ "vext.8 q6, q8, q9, #8\n\t" "vshl.u64 q4, q15, #45\n\t" @@ -3804,6 +3773,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q8, q5\n\t" +#else + /* Calc new W[0]-W[1] */ + "vmov d12, d17\n\t" + "vmov d13, d18\n\t" + "vshl.u64 d8, d30, #45\n\t" + "vshl.u64 d9, d31, #45\n\t" + "vsri.u64 d8, d30, #19\n\t" + "vsri.u64 d9, d31, #19\n\t" + "vshl.u64 d10, d30, #3\n\t" + "vshl.u64 d11, d31, #3\n\t" + "vsri.u64 d10, d30, #61\n\t" + "vsri.u64 d11, d31, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d30, #6\n\t" + "vshr.u64 d9, d31, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d16, d10\n\t" + "vadd.i64 d17, d11\n\t" + "vmov d14, d25\n\t" + "vmov d15, d26\n\t" + "vadd.i64 d16, d14\n\t" + "vadd.i64 d17, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d16, d10\n\t" + "vadd.i64 d17, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 2 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d2, #50\n\t" @@ -3862,6 +3872,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d0, d4\n\t" "vadd.i64 d4, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[2]-W[3] */ "vext.8 q6, q9, q10, #8\n\t" "vshl.u64 q4, q8, #45\n\t" @@ -3882,6 +3893,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q9, q5\n\t" +#else + /* Calc new W[2]-W[3] */ + "vmov d12, d19\n\t" + "vmov d13, d20\n\t" + "vshl.u64 d8, d16, #45\n\t" + "vshl.u64 d9, d17, #45\n\t" + "vsri.u64 d8, d16, #19\n\t" + "vsri.u64 d9, d17, #19\n\t" + "vshl.u64 d10, d16, #3\n\t" + "vshl.u64 d11, d17, #3\n\t" + "vsri.u64 d10, d16, #61\n\t" + "vsri.u64 d11, d17, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d16, #6\n\t" + "vshr.u64 d9, d17, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d18, d10\n\t" + "vadd.i64 d19, d11\n\t" + "vmov d14, d27\n\t" + "vmov d15, 
d28\n\t" + "vadd.i64 d18, d14\n\t" + "vadd.i64 d19, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d18, d10\n\t" + "vadd.i64 d19, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 4 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d0, #50\n\t" @@ -3940,6 +3992,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d6, d2\n\t" "vadd.i64 d2, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[4]-W[5] */ "vext.8 q6, q10, q11, #8\n\t" "vshl.u64 q4, q9, #45\n\t" @@ -3960,6 +4013,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q10, q5\n\t" +#else + /* Calc new W[4]-W[5] */ + "vmov d12, d21\n\t" + "vmov d13, d22\n\t" + "vshl.u64 d8, d18, #45\n\t" + "vshl.u64 d9, d19, #45\n\t" + "vsri.u64 d8, d18, #19\n\t" + "vsri.u64 d9, d19, #19\n\t" + "vshl.u64 d10, d18, #3\n\t" + "vshl.u64 d11, d19, #3\n\t" + "vsri.u64 d10, d18, #61\n\t" + "vsri.u64 d11, d19, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d18, #6\n\t" + "vshr.u64 d9, d19, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d20, d10\n\t" + "vadd.i64 d21, d11\n\t" + "vmov d14, d29\n\t" + "vmov d15, d30\n\t" + "vadd.i64 d20, d14\n\t" + "vadd.i64 d21, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d20, d10\n\t" + "vadd.i64 d21, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 6 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d6, #50\n\t" @@ -4018,6 +4112,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d4, d0\n\t" "vadd.i64 d0, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[6]-W[7] */ "vext.8 q6, q11, q12, #8\n\t" "vshl.u64 q4, q10, #45\n\t" @@ -4038,6 +4133,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q11, q5\n\t" +#else + /* Calc new W[6]-W[7] */ + "vmov d12, d23\n\t" + "vmov d13, d24\n\t" + "vshl.u64 d8, d20, #45\n\t" + "vshl.u64 d9, d21, #45\n\t" + "vsri.u64 d8, d20, #19\n\t" + "vsri.u64 d9, d21, #19\n\t" + "vshl.u64 d10, d20, #3\n\t" + "vshl.u64 d11, d21, #3\n\t" + "vsri.u64 d10, d20, #61\n\t" + "vsri.u64 d11, d21, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d20, #6\n\t" + "vshr.u64 d9, d21, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d22, d10\n\t" + "vadd.i64 d23, d11\n\t" + "vmov d14, d31\n\t" + "vmov d15, d16\n\t" + "vadd.i64 d22, d14\n\t" + "vadd.i64 d23, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" 
+ "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d22, d10\n\t" + "vadd.i64 d23, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 8 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d4, #50\n\t" @@ -4096,6 +4232,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d2, d6\n\t" "vadd.i64 d6, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[8]-W[9] */ "vext.8 q6, q12, q13, #8\n\t" "vshl.u64 q4, q11, #45\n\t" @@ -4116,6 +4253,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q12, q5\n\t" +#else + /* Calc new W[8]-W[9] */ + "vmov d12, d25\n\t" + "vmov d13, d26\n\t" + "vshl.u64 d8, d22, #45\n\t" + "vshl.u64 d9, d23, #45\n\t" + "vsri.u64 d8, d22, #19\n\t" + "vsri.u64 d9, d23, #19\n\t" + "vshl.u64 d10, d22, #3\n\t" + "vshl.u64 d11, d23, #3\n\t" + "vsri.u64 d10, d22, #61\n\t" + "vsri.u64 d11, d23, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d22, #6\n\t" + "vshr.u64 d9, d23, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d24, d10\n\t" + "vadd.i64 d25, d11\n\t" + "vmov d14, d17\n\t" + "vmov d15, d18\n\t" + "vadd.i64 d24, d14\n\t" + "vadd.i64 d25, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d24, d10\n\t" + "vadd.i64 d25, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 10 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d2, #50\n\t" @@ -4174,6 +4352,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d0, d4\n\t" "vadd.i64 d4, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[10]-W[11] */ "vext.8 q6, q13, q14, #8\n\t" "vshl.u64 q4, q12, #45\n\t" @@ -4194,6 +4373,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q13, q5\n\t" +#else + /* Calc new W[10]-W[11] */ + "vmov d12, d27\n\t" + "vmov d13, d28\n\t" + "vshl.u64 d8, d24, #45\n\t" + "vshl.u64 d9, d25, #45\n\t" + "vsri.u64 d8, d24, #19\n\t" + "vsri.u64 d9, d25, #19\n\t" + "vshl.u64 d10, d24, #3\n\t" + "vshl.u64 d11, d25, #3\n\t" + "vsri.u64 d10, d24, #61\n\t" + "vsri.u64 d11, d25, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d24, #6\n\t" + "vshr.u64 d9, d25, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d26, d10\n\t" + "vadd.i64 d27, d11\n\t" + "vmov d14, d19\n\t" + "vmov d15, d20\n\t" + "vadd.i64 d26, d14\n\t" + "vadd.i64 d27, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d26, d10\n\t" + "vadd.i64 d27, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 12 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d0, #50\n\t" @@ -4252,6 +4472,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 
d6, d2\n\t" "vadd.i64 d2, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[12]-W[13] */ "vext.8 q6, q14, q15, #8\n\t" "vshl.u64 q4, q13, #45\n\t" @@ -4272,6 +4493,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q14, q5\n\t" +#else + /* Calc new W[12]-W[13] */ + "vmov d12, d29\n\t" + "vmov d13, d30\n\t" + "vshl.u64 d8, d26, #45\n\t" + "vshl.u64 d9, d27, #45\n\t" + "vsri.u64 d8, d26, #19\n\t" + "vsri.u64 d9, d27, #19\n\t" + "vshl.u64 d10, d26, #3\n\t" + "vshl.u64 d11, d27, #3\n\t" + "vsri.u64 d10, d26, #61\n\t" + "vsri.u64 d11, d27, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d26, #6\n\t" + "vshr.u64 d9, d27, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d28, d10\n\t" + "vadd.i64 d29, d11\n\t" + "vmov d14, d21\n\t" + "vmov d15, d22\n\t" + "vadd.i64 d28, d14\n\t" + "vadd.i64 d29, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d28, d10\n\t" + "vadd.i64 d29, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 14 */ "vld1.64 {d12}, [r3]!\n\t" "vshl.u64 d8, d6, #50\n\t" @@ -4330,6 +4592,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d10, d9\n\t" "vadd.i64 d4, d0\n\t" "vadd.i64 d0, d10\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT /* Calc new W[14]-W[15] */ "vext.8 q6, q15, q8, #8\n\t" "vshl.u64 q4, q14, #45\n\t" @@ -4350,6 +4613,47 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vshr.u64 q6, #7\n\t" "veor q5, q6\n\t" "vadd.i64 q15, q5\n\t" +#else + /* Calc new W[14]-W[15] */ + "vmov d12, d31\n\t" + "vmov d13, d16\n\t" + "vshl.u64 d8, d28, #45\n\t" + "vshl.u64 d9, d29, #45\n\t" + "vsri.u64 d8, d28, #19\n\t" + "vsri.u64 d9, d29, #19\n\t" + "vshl.u64 d10, d28, #3\n\t" + "vshl.u64 d11, d29, #3\n\t" + "vsri.u64 d10, d28, #61\n\t" + "vsri.u64 d11, d29, #61\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d8, d28, #6\n\t" + "vshr.u64 d9, d29, #6\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vadd.i64 d30, d10\n\t" + "vadd.i64 d31, d11\n\t" + "vmov d14, d23\n\t" + "vmov d15, d24\n\t" + "vadd.i64 d30, d14\n\t" + "vadd.i64 d31, d15\n\t" + "vshl.u64 d8, d12, #63\n\t" + "vshl.u64 d9, d13, #63\n\t" + "vsri.u64 d8, d12, #1\n\t" + "vsri.u64 d9, d13, #1\n\t" + "vshl.u64 d10, d12, #56\n\t" + "vshl.u64 d11, d13, #56\n\t" + "vsri.u64 d10, d12, #8\n\t" + "vsri.u64 d11, d13, #8\n\t" + "veor d10, d8\n\t" + "veor d11, d9\n\t" + "vshr.u64 d12, #7\n\t" + "vshr.u64 d13, #7\n\t" + "veor d10, d12\n\t" + "veor d11, d13\n\t" + "vadd.i64 d30, d10\n\t" + "vadd.i64 d31, d11\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ "subs r12, r12, #1\n\t" "bne L_SHA512_transform_neon_len_start_%=\n\t" /* Round 0 */ @@ -4818,10 +5122,21 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "vadd.i64 d0, d10\n\t" /* Add in digest from start */ "vldm.64 %[sha512], {d8-d15}\n\t" +#ifndef WOLFSSL_ARM_ARCH_NEON_64BIT "vadd.i64 q0, q0, q4\n\t" "vadd.i64 q1, q1, q5\n\t" "vadd.i64 q2, q2, q6\n\t" "vadd.i64 q3, q3, q7\n\t" +#else + "vadd.i64 d0, d0, d8\n\t" + "vadd.i64 d1, d1, d9\n\t" + "vadd.i64 d2, d2, d10\n\t" + "vadd.i64 d3, d3, d11\n\t" + "vadd.i64 d4, d4, 
d12\n\t" + "vadd.i64 d5, d5, d13\n\t" + "vadd.i64 d6, d6, d14\n\t" + "vadd.i64 d7, d7, d15\n\t" +#endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ "vstm.64 %[sha512], {d0-d7}\n\t" "subs %[len], %[len], #0x80\n\t" "bne L_SHA512_transform_neon_len_begin_%=\n\t" From 7062ed0a2cb7c389e91ebdf81cbd21c68a5c621e Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 16 Sep 2022 11:29:30 +1000 Subject: [PATCH 6/9] ChaCha ARM 32-bit: get debug working r7 is needed for debug. Needed to use stack explicitly rather than let compiler use r7. --- wolfcrypt/src/port/arm/armv8-chacha.c | 73 +++++++++++++++------------ 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index 83d242671..edd51e726 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -968,14 +968,16 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "v21", "v22", "v23" ); #else - word32 x[CHACHA_CHUNK_WORDS]; - word32* x_addr = x; __asm__ __volatile__ ( // The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM // https://cryptojedi.org/papers/neoncrypto-20120320.pdf ".align 2 \n\t" "LDR r14, %[input] \n\t" // load input address + #ifndef NDEBUG + "PUSH { r7 } \n\t" + #endif + "SUB sp, sp, #16*4 \n\t" "LDM r14, { r0-r12 } \n\t" // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 @@ -985,7 +987,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VMOV d2, r4, r5 \n\t" "VMOV d3, r6, r7 \n\t" "VMOV d4, r8, r9 \n\t" - "STRD r10, r11, %[x_10] \n\t" + "STRD r10, r11, [sp, #4*10] \n\t" "VMOV d5, r10, r11 \n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 8) "LDR r11, [r14, #4*14] \n\t" @@ -1002,7 +1004,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 // 0 1 2 3 4 5 6 7 8 9 15 14 12 "VMOV d7, r11, r10 \n\t" - "STR r10, %[x_15] \n\t" + "STR r10, [sp, #4*15] \n\t" "VMOV d15, r11, r10 \n\t" "VMOV d23, r11, r10 \n\t" "MOV r10, r12 \n\t" @@ -1070,22 +1072,22 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VSRI.I32 q1, q12, #20 \n\t" "ADD r9, r9, r11 \n\t" // 9 9 13 "VSRI.I32 q5, q13, #20 \n\t" - "STR r11, %[x_13] \n\t" + "STR r11, [sp, #4*13] \n\t" "VSRI.I32 q9, q14, #20 \n\t" - "LDR r11, %[x_15] \n\t" + "LDR r11, [sp, #4*15] \n\t" "VADD.I32 q0, q0, q1 \n\t" "EOR r4, r4, r8 \n\t" // 4 4 8 "VADD.I32 q4, q4, q5 \n\t" - "STR r8, %[x_8] \n\t" + "STR r8, [sp, #4*8] \n\t" "VADD.I32 q8, q8, q9 \n\t" - "LDR r8, %[x_10] \n\t" + "LDR r8, [sp, #4*10] \n\t" "VEOR q12, q3, q0 \n\t" "EOR r5, r5, r9 \n\t" // 5 5 9 "VEOR q13, q7, q4 \n\t" - "STR r9, %[x_9] \n\t" + "STR r9, [sp, #4*9] \n\t" "VEOR q14, q11, q8 \n\t" - "LDR r9, %[x_11] \n\t" + "LDR r9, [sp, #4*11] \n\t" // SIMD instructions don't support rotation so we have to cheat using shifts and a help register "VSHL.I32 q3, q12, #8 \n\t" "ROR r4, r4, #25 \n\t" // 4 4 @@ -1199,24 +1201,24 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VSHL.I32 q9, q14, #12 \n\t" "ADD r8, r8, r11 \n\t" // 10 10 15 "VSRI.I32 q1, q12, #20 \n\t" - "STR r11, %[x_15] \n\t" + "STR r11, [sp, #4*15] \n\t" "VSRI.I32 q5, q13, #20 \n\t" - "LDR r11, %[x_13] \n\t" + "LDR r11, [sp, #4*13] \n\t" "VSRI.I32 q9, q14, #20 \n\t" "ADD r9, r9, r10 \n\t" // 11 11 12 "VADD.I32 q0, q0, q1 \n\t" "EOR r5, r5, r8 \n\t" // 5 5 10 "VADD.I32 q4, q4, q5 \n\t" - "STR r8, %[x_10] \n\t" + 
"STR r8, [sp, #4*10] \n\t" "VADD.I32 q8, q8, q9 \n\t" - "LDR r8, %[x_8] \n\t" + "LDR r8, [sp, #4*8] \n\t" "VEOR q12, q3, q0 \n\t" "EOR r6, r6, r9 \n\t" // 6 6 11 "VEOR q13, q7, q4 \n\t" - "STR r9, %[x_11] \n\t" + "STR r9, [sp, #4*11] \n\t" "VEOR q14, q11, q8 \n\t" - "LDR r9, %[x_9] \n\t" + "LDR r9, [sp, #4*9] \n\t" // SIMD instructions don't support rotation so we have to cheat using shifts and a help register "VSHL.I32 q3, q12, #8 \n\t" "ROR r5, r5, #25 \n\t" // 5 5 @@ -1286,18 +1288,26 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "BNE L_chacha20_arm32_256_loop_%= \n\t" - "LDR r14, %[x_addr] \n\t" // load address of x to r14 // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 // 0 1 2 3 4 5 6 7 8 9 12 13 14 "ADD r10, r10, #3 \n\t" // add three here to make later NEON easier - "STM r14, { r0-r9 } \n\t" - "STRD r10, r11, [r14, #4*12] \n\t" + "STM sp, { r0-r9 } \n\t" + "STRD r10, r11, [sp, #4*12] \n\t" + "STR r12, [sp, #4*14] \n\t" + "ADD sp, sp, #16*4 \n\t" + #ifndef NDEBUG + "POP { r7 } \n\t" + #endif "LDR r9, %[input] \n\t" // load input address - "STR r12, [r14, #4*14] \n\t" "LDR r10, %[c] \n\t" // load c address "VLDM r9, { q12-q15 } \n\t" "LDR r12, %[m] \n\t" // load m address + #ifndef NDEBUG + "SUB sp, sp, #17*4 \n\t" + #else + "SUB sp, sp, #16*4 \n\t" + #endif "VADD.I32 q0, q0, q12 \n\t" "VADD.I32 q1, q1, q13 \n\t" @@ -1329,7 +1339,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VEOR q3, q3, q15 \n\t" "VSTM r10!, { q0-q3 } \n\t" // store to c - "VLDM r14, { q0-q3 } \n\t " // load final block from x + "VLDM sp, { q0-q3 } \n\t " // load final block from x "VLDM r12!, { q12-q15 } \n\t" // load m "VEOR q4, q4, q12 \n\t" @@ -1358,20 +1368,21 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VEOR q3, q3, q15 \n\t" "VSTM r10!, { q0-q3 } \n\t" // store to c - : [c] "+m" (c), - [x_0] "=m" (x), - [x_8] "=m" (x[8]), - [x_9] "=m" (x[9]), - [x_10] "=m" (x[10]), - [x_11] "=m" (x[11]), - [x_13] "=m" (x[13]), - [x_15] "=m" (x[15]) + #ifndef NDEBUG + "ADD sp, sp, #17*4 \n\t" + #else + "ADD sp, sp, #16*4 \n\t" + #endif + : [c] "+m" (c) : [rounds] "I" (ROUNDS/2), [input] "m" (input), [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES), - [m] "m" (m), [x_addr] "m" (x_addr) + [m] "m" (m) : "memory", "cc", "r0", "r1", "r2", "r3", - "r4", "r5", "r6", "r7", + "r4", "r5", "r6", + #ifdef NDEBUG + "r7", + #endif "r8", "r9", "r10", "r11", "r12", "r14", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", From ca392fb200dea216c13cb6957ece5813b7c6c88d Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 21 Sep 2022 09:58:16 +1000 Subject: [PATCH 7/9] ARM32 ASM Inline: fix SHA-256 inline asm to compile --- wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 06ea84e1f..e81fd7939 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -2499,7 +2499,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "bne L_SHA256_transform_neon_len_begin_%=\n\t" "add sp, sp, #24\n\t" : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) - : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k), [L_SHA256_transform_neon_len_k] "r" (L_SHA256_transform_neon_len_k) + : [L_SHA256_transform_neon_len_k] "r" (L_SHA256_transform_neon_len_k) : "memory", "r3", "r12", "lr", 
"r4", "r5", "r6", "r7", "r8", "r9", "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11" ); } From 2578f2c8f2836f7f087b52aa7cb19ef13286b610 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 21 Sep 2022 10:45:50 +1000 Subject: [PATCH 8/9] ARMv8 32 Inline ASM: fixes Force parameters to use specific registers. Shift/rotate amount must have '#' prepended. --- .../src/port/arm/armv8-32-curve25519_c.c | 181 ++- .../src/port/arm/armv8-32-sha256-asm_c.c | 656 +++++----- .../src/port/arm/armv8-32-sha512-asm_c.c | 1104 +++++++++-------- 3 files changed, 1044 insertions(+), 897 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 7a5be1771..485643554 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -39,6 +39,7 @@ void fe_init() { + __asm__ __volatile__ ( "\n\t" : @@ -47,8 +48,11 @@ void fe_init() ); } -void fe_frombytes(fe out, const unsigned char* in) +void fe_frombytes(fe out_p, const unsigned char* in_p) { + register fe out asm ("r0") = out_p; + register const unsigned char* in asm ("r1") = in_p; + __asm__ __volatile__ ( "ldrd r2, r3, [%[in]]\n\t" "ldrd r12, lr, [%[in], #8]\n\t" @@ -65,8 +69,11 @@ void fe_frombytes(fe out, const unsigned char* in) ); } -void fe_tobytes(unsigned char* out, const fe n) +void fe_tobytes(unsigned char* out_p, const fe n_p) { + register unsigned char* out asm ("r0") = out_p; + register const fe n asm ("r1") = n_p; + __asm__ __volatile__ ( "ldrd r2, r3, [%[n]]\n\t" "ldrd r12, lr, [%[n], #8]\n\t" @@ -101,8 +108,10 @@ void fe_tobytes(unsigned char* out, const fe n) ); } -void fe_1(fe n) +void fe_1(fe n_p) { + register fe n asm ("r0") = n_p; + __asm__ __volatile__ ( /* Set one */ "mov r2, #1\n\t" @@ -117,8 +126,10 @@ void fe_1(fe n) ); } -void fe_0(fe n) +void fe_0(fe n_p) { + register fe n asm ("r0") = n_p; + __asm__ __volatile__ ( /* Set zero */ "mov r1, #0\n\t" @@ -132,8 +143,11 @@ void fe_0(fe n) ); } -void fe_copy(fe r, const fe a) +void fe_copy(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( /* Copy */ "ldrd r2, r3, [%[a]]\n\t" @@ -150,8 +164,12 @@ void fe_copy(fe r, const fe a) ); } -void fe_sub(fe r, const fe a, const fe b) +void fe_sub(fe r_p, const fe a_p, const fe b_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + __asm__ __volatile__ ( /* Sub */ "ldrd r12, lr, [%[a]]\n\t" @@ -198,8 +216,12 @@ void fe_sub(fe r, const fe a, const fe b) ); } -void fe_add(fe r, const fe a, const fe b) +void fe_add(fe r_p, const fe a_p, const fe b_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + __asm__ __volatile__ ( /* Add */ "ldrd r12, lr, [%[a]]\n\t" @@ -246,8 +268,11 @@ void fe_add(fe r, const fe a, const fe b) ); } -void fe_neg(fe r, const fe a) +void fe_neg(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( "mov r5, #-1\n\t" "mov r4, #-19\n\t" @@ -274,8 +299,10 @@ void fe_neg(fe r, const fe a) ); } -int fe_isnonzero(const fe a) +int fe_isnonzero(const fe a_p) { + register const fe a asm ("r0") = a_p; + __asm__ __volatile__ ( "ldrd r2, r3, [%[a]]\n\t" "ldrd r12, lr, [%[a], #8]\n\t" @@ -314,8 +341,10 @@ int fe_isnonzero(const fe a) return (uint32_t)(size_t)a; } -int fe_isnegative(const fe a) +int fe_isnegative(const fe 
a_p) { + register const fe a asm ("r0") = a_p; + __asm__ __volatile__ ( "ldrd r2, r3, [%[a]]\n\t" "ldrd r12, lr, [%[a], #8]\n\t" @@ -340,8 +369,12 @@ int fe_isnegative(const fe a) return (uint32_t)(size_t)a; } -void fe_cmov_table(fe* r, fe* base, signed char b) +void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) { + register fe* r asm ("r0") = r_p; + register fe* base asm ("r1") = base_p; + register signed char b asm ("r2") = b_p; + __asm__ __volatile__ ( "sxtb %[b], %[b]\n\t" "sbfx r7, %[b], #7, #1\n\t" @@ -1312,8 +1345,12 @@ void fe_cmov_table(fe* r, fe* base, signed char b) ); } -void fe_mul(fe r, const fe a, const fe b) +void fe_mul(fe r_p, const fe a_p, const fe b_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" /* Multiply */ @@ -1842,8 +1879,11 @@ void fe_mul(fe r, const fe a, const fe b) ); } -void fe_sq(fe r, const fe a) +void fe_sq(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" /* Square */ @@ -2264,8 +2304,11 @@ void fe_sq(fe r, const fe a) ); } -void fe_mul121666(fe r, fe a) +void fe_mul121666(fe r_p, fe a_p) { + register fe r asm ("r0") = r_p; + register fe a asm ("r1") = a_p; + __asm__ __volatile__ ( /* Multiply by 121666 */ "ldrd r2, r3, [%[a]]\n\t" @@ -2319,8 +2362,11 @@ void fe_mul121666(fe r, fe a) ); } -void fe_sq2(fe r, const fe a) +void fe_sq2(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" /* Square * 2 */ @@ -2756,8 +2802,11 @@ void fe_sq2(fe r, const fe a) ); } -void fe_invert(fe r, const fe a) +void fe_invert(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" /* Invert */ @@ -2915,8 +2964,12 @@ void fe_invert(fe r, const fe a) ); } -int curve25519(byte* r, const byte* n, const byte* a) +int curve25519(byte* r_p, const byte* n_p, const byte* a_p) { + register byte* r asm ("r0") = r_p; + register const byte* n asm ("r1") = n_p; + register const byte* a asm ("r2") = a_p; + __asm__ __volatile__ ( "sub sp, sp, #0xbc\n\t" "str %[r], [sp, #160]\n\t" @@ -3694,8 +3747,11 @@ int curve25519(byte* r, const byte* n, const byte* a) return (uint32_t)(size_t)r; } -void fe_pow22523(fe r, const fe a) +void fe_pow22523(fe r_p, const fe a_p) { + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + __asm__ __volatile__ ( "sub sp, sp, #0x68\n\t" /* pow22523 */ @@ -3853,8 +3909,16 @@ void fe_pow22523(fe r, const fe a) ); } -void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +void fe_ge_to_p2(fe rx_p, fe ry_p, fe rz_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register const fe px asm ("r3") = px_p; + register const fe py asm ("r4") = py_p; + register const fe pz asm ("r5") = pz_p; + register const fe pt asm ("r6") = pt_p; + __asm__ __volatile__ ( "sub sp, sp, #16\n\t" "str %[rx], [sp]\n\t" @@ -3883,8 +3947,17 @@ void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, con (void)pt; } -void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +void fe_ge_to_p3(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const 
fe pz_p, const fe pt_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + register const fe pt asm ("r7") = pt_p; + __asm__ __volatile__ ( "sub sp, sp, #16\n\t" "str %[rx], [sp]\n\t" @@ -3918,8 +3991,16 @@ void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe (void)pt; } -void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +void fe_ge_dbl(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + __asm__ __volatile__ ( "sub sp, sp, #16\n\t" "str %[rx], [sp]\n\t" @@ -4175,8 +4256,20 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz (void)pz; } -void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +void fe_ge_madd(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qxy2d_p, const fe qyplusx_p, const fe qyminusx_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + register const fe pt asm ("r7") = pt_p; + register const fe qxy2d asm ("r8") = qxy2d_p; + register const fe qyplusx asm ("r9") = qyplusx_p; + register const fe qyminusx asm ("r10") = qyminusx_p; + __asm__ __volatile__ ( "sub sp, sp, #32\n\t" "str %[rx], [sp]\n\t" @@ -4529,8 +4622,20 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p (void)qyminusx; } -void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +void fe_ge_msub(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qxy2d_p, const fe qyplusx_p, const fe qyminusx_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + register const fe pt asm ("r7") = pt_p; + register const fe qxy2d asm ("r8") = qxy2d_p; + register const fe qyplusx asm ("r9") = qyplusx_p; + register const fe qyminusx asm ("r10") = qyminusx_p; + __asm__ __volatile__ ( "sub sp, sp, #32\n\t" "str %[rx], [sp]\n\t" @@ -4883,8 +4988,21 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p (void)qyminusx; } -void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +void fe_ge_add(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qz_p, const fe qt2d_p, const fe qyplusx_p, const fe qyminusx_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register 
fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + register const fe pt asm ("r7") = pt_p; + register const fe qz asm ("r8") = qz_p; + register const fe qt2d asm ("r9") = qt2d_p; + register const fe qyplusx asm ("r10") = qyplusx_p; + register const fe qyminusx asm ("r11") = qyminusx_p; + __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" "str %[rx], [sp]\n\t" @@ -5243,8 +5361,21 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz (void)qyminusx; } -void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +void fe_ge_sub(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qz_p, const fe qt2d_p, const fe qyplusx_p, const fe qyminusx_p) { + register fe rx asm ("r0") = rx_p; + register fe ry asm ("r1") = ry_p; + register fe rz asm ("r2") = rz_p; + register fe rt asm ("r3") = rt_p; + register const fe px asm ("r4") = px_p; + register const fe py asm ("r5") = py_p; + register const fe pz asm ("r6") = pz_p; + register const fe pt asm ("r7") = pt_p; + register const fe qz asm ("r8") = qz_p; + register const fe qt2d asm ("r9") = qt2d_p; + register const fe qyplusx asm ("r10") = qyplusx_p; + register const fe qyminusx asm ("r11") = qyminusx_p; + __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" "str %[rx], [sp]\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index e81fd7939..22b1331fa 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -104,9 +104,13 @@ static const uint32_t L_SHA256_transform_len_k[] = { 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { + register wc_Sha256* sha256 asm ("r0") = sha256_p; + register const byte* data asm ("r1") = data_p; + register word32 len asm ("r2") = len_p; + __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" "mov r3, %[L_SHA256_transform_len_k]\n\t" @@ -169,9 +173,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #28]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -185,9 +189,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #12]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -201,10 +205,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, 
ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -216,9 +220,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #24]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -232,9 +236,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #8]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -248,10 +252,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #4]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -263,9 +267,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #20]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -279,9 +283,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #4]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -295,10 +299,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #8]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -310,9 +314,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #16]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -326,9 +330,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256]]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -342,10 +346,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, 
#12]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -357,9 +361,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #12]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -373,9 +377,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #28]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -389,10 +393,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #16]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -404,9 +408,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #8]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -420,9 +424,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #24]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -436,10 +440,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #20]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -451,9 +455,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #4]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -467,9 +471,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #20]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - 
"eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -483,10 +487,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #24]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -498,9 +502,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256]]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -514,9 +518,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #16]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -530,10 +534,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #28]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -545,9 +549,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #28]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -561,9 +565,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #12]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -577,10 +581,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #32]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -592,9 +596,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #24]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -608,9 +612,9 @@ 
void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #8]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -624,10 +628,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #36]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -639,9 +643,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #20]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -655,9 +659,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #4]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -671,10 +675,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #40]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -686,9 +690,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #16]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -702,9 +706,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256]]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -718,10 +722,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #44]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -733,9 +737,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #12]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - 
"eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -749,9 +753,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #28]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -765,10 +769,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #48]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -780,9 +784,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #8]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -796,9 +800,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #24]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -812,10 +816,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #52]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -827,9 +831,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #4]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -843,9 +847,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #20]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -859,10 +863,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #56]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr 
#3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -874,9 +878,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256]]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -890,9 +894,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #16]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -906,10 +910,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [sp, #60]\n\t" "ror r12, r4, #17\n\t" "ror lr, r6, #7\n\t" - "eor r12, r12, r4, ror 19\n\t" - "eor lr, lr, r6, ror 18\n\t" - "eor r12, r12, r4, lsr 10\n\t" - "eor lr, lr, r6, lsr 3\n\t" + "eor r12, r12, r4, ror #19\n\t" + "eor lr, lr, r6, ror #18\n\t" + "eor r12, r12, r4, lsr #10\n\t" + "eor lr, lr, r6, lsr #3\n\t" "add r7, r7, r5\n\t" "add r12, r12, lr\n\t" "add r7, r7, r12\n\t" @@ -924,9 +928,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #28]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -940,9 +944,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #12]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -956,9 +960,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #24]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -972,9 +976,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #8]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -988,9 +992,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #20]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1004,9 +1008,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #4]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, 
ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1020,9 +1024,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #16]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1036,9 +1040,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256]]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1052,9 +1056,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #12]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1068,9 +1072,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #28]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1084,9 +1088,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #8]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1100,9 +1104,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #24]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1116,9 +1120,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #4]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1132,9 +1136,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #20]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1148,9 +1152,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256]]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, 
r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1164,9 +1168,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #16]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1180,9 +1184,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #28]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1196,9 +1200,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #12]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1212,9 +1216,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #24]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1228,9 +1232,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #8]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1244,9 +1248,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #20]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1260,9 +1264,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #4]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1276,9 +1280,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #16]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1292,9 +1296,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256]]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, 
r12\n\t" @@ -1308,9 +1312,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #12]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1324,9 +1328,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #28]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1340,9 +1344,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #8]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1356,9 +1360,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #24]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1372,9 +1376,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256], #4]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1388,9 +1392,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #20]\n\t" "ror r12, lr, #2\n\t" "eor r8, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r9, r9, r8\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r9, r9, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1404,9 +1408,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r7, [%[sha256]]\n\t" "ror r12, lr, #6\n\t" "eor r4, r4, r5\n\t" - "eor r12, r12, lr, ror 11\n\t" + "eor r12, r12, lr, ror #11\n\t" "and r4, r4, lr\n\t" - "eor r12, r12, lr, ror 25\n\t" + "eor r12, r12, lr, ror #25\n\t" "eor r4, r4, r5\n\t" "add r7, r7, r12\n\t" "add r7, r7, r4\n\t" @@ -1420,9 +1424,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "ldr r6, [%[sha256], #16]\n\t" "ror r12, lr, #2\n\t" "eor r9, lr, r4\n\t" - "eor r12, r12, lr, ror 13\n\t" + "eor r12, r12, lr, ror #13\n\t" "and r8, r8, r9\n\t" - "eor r12, r12, lr, ror 22\n\t" + "eor r12, r12, lr, ror #22\n\t" "eor r8, r8, r4\n\t" "add r6, r6, r7\n\t" "add r7, r7, r12\n\t" @@ -1536,9 +1540,13 @@ static const uint32_t L_SHA256_transform_neon_len_k[] = { 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); -void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); +void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { + register 
wc_Sha256* sha256 asm ("r0") = sha256_p; + register const byte* data asm ("r1") = data_p; + register word32 len asm ("r2") = len_p; + __asm__ __volatile__ ( "sub sp, sp, #24\n\t" "strd %[sha256], %[data], [sp]\n\t" @@ -1578,9 +1586,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d0[0]\n\t" "ror %[sha256], r6, #6\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r6, ror 11\n\t" + "eor %[sha256], %[sha256], r6, ror #11\n\t" "and %[data], %[data], r6\n\t" - "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[sha256], %[sha256], r6, ror #25\n\t" "eor %[data], %[data], r8\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -1590,10 +1598,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r5, r5, r9\n\t" "ror %[sha256], %[len], #2\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor %[sha256], %[sha256], %[len], ror #13\n\t" "eor r10, r3, r4\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[sha256], %[sha256], %[len], ror #22\n\t" "eor %[data], %[data], r3\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -1605,11 +1613,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d7, #15\n\t" "eor %[data], r6, r7\n\t" "vsri.u32 d8, d7, #17\n\t" - "eor %[sha256], %[sha256], r5, ror 11\n\t" + "eor %[sha256], %[sha256], r5, ror #11\n\t" "vshl.u32 d9, d7, #13\n\t" "and %[data], %[data], r5\n\t" "vsri.u32 d9, d7, #19\n\t" - "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[sha256], %[sha256], r5, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r7\n\t" "vshr.u32 d8, d7, #10\n\t" @@ -1629,13 +1637,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r9, %[len]\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor %[sha256], %[sha256], r9, ror #13\n\t" "veor d9, d8\n\t" "eor r10, %[len], r3\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r9, ror 22\n\t" + "eor %[sha256], %[sha256], r9, ror #22\n\t" "vadd.i32 d0, d9\n\t" "eor %[data], %[data], %[len]\n\t" "add r8, r8, %[sha256]\n\t" @@ -1644,9 +1652,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d1[0]\n\t" "ror %[sha256], r4, #6\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r4, ror 11\n\t" + "eor %[sha256], %[sha256], r4, ror #11\n\t" "and %[data], %[data], r4\n\t" - "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[sha256], %[sha256], r4, ror #25\n\t" "eor %[data], %[data], r6\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -1656,10 +1664,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r3, r3, r7\n\t" "ror %[sha256], r8, #2\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor %[sha256], %[sha256], r8, ror #13\n\t" "eor r10, r9, %[len]\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[sha256], %[sha256], r8, ror #22\n\t" "eor %[data], %[data], r9\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -1671,11 +1679,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d0, #15\n\t" "eor %[data], r4, r5\n\t" "vsri.u32 d8, d0, #17\n\t" - "eor %[sha256], %[sha256], r3, ror 11\n\t" + "eor %[sha256], %[sha256], r3, ror #11\n\t" "vshl.u32 d9, d0, 
#13\n\t" "and %[data], %[data], r3\n\t" "vsri.u32 d9, d0, #19\n\t" - "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[sha256], %[sha256], r3, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r5\n\t" "vshr.u32 d8, d0, #10\n\t" @@ -1695,13 +1703,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r7, r8\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor %[sha256], %[sha256], r7, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r8, r9\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[sha256], %[sha256], r7, ror #22\n\t" "vadd.i32 d1, d9\n\t" "eor %[data], %[data], r8\n\t" "add r6, r6, %[sha256]\n\t" @@ -1710,9 +1718,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d2[0]\n\t" "ror %[sha256], %[len], #6\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "eor %[sha256], %[sha256], %[len], ror #11\n\t" "and %[data], %[data], %[len]\n\t" - "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[sha256], %[sha256], %[len], ror #25\n\t" "eor %[data], %[data], r4\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -1722,10 +1730,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r9, r9, r5\n\t" "ror %[sha256], r6, #2\n\t" "eor %[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor %[sha256], %[sha256], r6, ror #13\n\t" "eor r10, r7, r8\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[sha256], %[sha256], r6, ror #22\n\t" "eor %[data], %[data], r7\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -1737,11 +1745,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d1, #15\n\t" "eor %[data], %[len], r3\n\t" "vsri.u32 d8, d1, #17\n\t" - "eor %[sha256], %[sha256], r9, ror 11\n\t" + "eor %[sha256], %[sha256], r9, ror #11\n\t" "vshl.u32 d9, d1, #13\n\t" "and %[data], %[data], r9\n\t" "vsri.u32 d9, d1, #19\n\t" - "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[sha256], %[sha256], r9, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r3\n\t" "vshr.u32 d8, d1, #10\n\t" @@ -1761,13 +1769,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r5, r6\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor %[sha256], %[sha256], r5, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r6, r7\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[sha256], %[sha256], r5, ror #22\n\t" "vadd.i32 d2, d9\n\t" "eor %[data], %[data], r6\n\t" "add r4, r4, %[sha256]\n\t" @@ -1776,9 +1784,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d3[0]\n\t" "ror %[sha256], r8, #6\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r8, ror 11\n\t" + "eor %[sha256], %[sha256], r8, ror #11\n\t" "and %[data], %[data], r8\n\t" - "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[sha256], %[sha256], r8, ror #25\n\t" "eor %[data], %[data], %[len]\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -1788,10 +1796,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r7, r7, r3\n\t" "ror %[sha256], r4, #2\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r4, 
ror 13\n\t" + "eor %[sha256], %[sha256], r4, ror #13\n\t" "eor r10, r5, r6\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[sha256], %[sha256], r4, ror #22\n\t" "eor %[data], %[data], r5\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -1803,11 +1811,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d2, #15\n\t" "eor %[data], r8, r9\n\t" "vsri.u32 d8, d2, #17\n\t" - "eor %[sha256], %[sha256], r7, ror 11\n\t" + "eor %[sha256], %[sha256], r7, ror #11\n\t" "vshl.u32 d9, d2, #13\n\t" "and %[data], %[data], r7\n\t" "vsri.u32 d9, d2, #19\n\t" - "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[sha256], %[sha256], r7, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r9\n\t" "vshr.u32 d8, d2, #10\n\t" @@ -1827,13 +1835,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r3, r4\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor %[sha256], %[sha256], r3, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r4, r5\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[sha256], %[sha256], r3, ror #22\n\t" "vadd.i32 d3, d9\n\t" "eor %[data], %[data], r4\n\t" "add %[len], %[len], %[sha256]\n\t" @@ -1842,9 +1850,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d4[0]\n\t" "ror %[sha256], r6, #6\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r6, ror 11\n\t" + "eor %[sha256], %[sha256], r6, ror #11\n\t" "and %[data], %[data], r6\n\t" - "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[sha256], %[sha256], r6, ror #25\n\t" "eor %[data], %[data], r8\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -1854,10 +1862,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r5, r5, r9\n\t" "ror %[sha256], %[len], #2\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor %[sha256], %[sha256], %[len], ror #13\n\t" "eor r10, r3, r4\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[sha256], %[sha256], %[len], ror #22\n\t" "eor %[data], %[data], r3\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -1869,11 +1877,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d3, #15\n\t" "eor %[data], r6, r7\n\t" "vsri.u32 d8, d3, #17\n\t" - "eor %[sha256], %[sha256], r5, ror 11\n\t" + "eor %[sha256], %[sha256], r5, ror #11\n\t" "vshl.u32 d9, d3, #13\n\t" "and %[data], %[data], r5\n\t" "vsri.u32 d9, d3, #19\n\t" - "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[sha256], %[sha256], r5, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r7\n\t" "vshr.u32 d8, d3, #10\n\t" @@ -1893,13 +1901,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r9, %[len]\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor %[sha256], %[sha256], r9, ror #13\n\t" "veor d9, d8\n\t" "eor r10, %[len], r3\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r9, ror 22\n\t" + "eor %[sha256], %[sha256], r9, ror #22\n\t" "vadd.i32 d4, d9\n\t" "eor %[data], %[data], %[len]\n\t" "add r8, r8, %[sha256]\n\t" @@ -1908,9 +1916,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) 
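
The `_p` parameter rename above, together with the `register ... asm ("r0")` / `("r1")` / `("r2")` locals, pins `sha256`, `data` and `len` to the registers the hand-written body expects under the AAPCS (first arguments in r0-r2), so the `%[sha256]`, `%[data]` and `%[len]` operands are known to start in those registers and can safely be reused as scratch after they are spilled to the stack. A minimal sketch of that pattern, using a hypothetical `accumulate` helper rather than the SHA-256 transform (illustrative only, not part of this patch):

#include <stdint.h>

/* Sum len_p words from buf_p; assumes len_p > 0.  The register locals pin
 * the asm operands to r0-r2, mirroring the Transform_Sha256_Len change. */
static uint32_t accumulate(const uint32_t* buf_p, uint32_t len_p)
{
    register const uint32_t* buf asm ("r0") = buf_p;
    register uint32_t len asm ("r1") = len_p;
    register uint32_t sum asm ("r2") = 0;

    __asm__ __volatile__ (
        "1:\n\t"
        "ldr  r3, [%[buf]], #4\n\t"      /* load word, post-increment pointer */
        "add  %[sum], %[sum], r3\n\t"
        "subs %[len], %[len], #1\n\t"
        "bne  1b\n\t"
        : [buf] "+r" (buf), [len] "+r" (len), [sum] "+r" (sum)
        :
        : "r3", "cc", "memory"
    );
    return sum;
}
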
"vmov r10, d5[0]\n\t" "ror %[sha256], r4, #6\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r4, ror 11\n\t" + "eor %[sha256], %[sha256], r4, ror #11\n\t" "and %[data], %[data], r4\n\t" - "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[sha256], %[sha256], r4, ror #25\n\t" "eor %[data], %[data], r6\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -1920,10 +1928,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r3, r3, r7\n\t" "ror %[sha256], r8, #2\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor %[sha256], %[sha256], r8, ror #13\n\t" "eor r10, r9, %[len]\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[sha256], %[sha256], r8, ror #22\n\t" "eor %[data], %[data], r9\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -1935,11 +1943,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d4, #15\n\t" "eor %[data], r4, r5\n\t" "vsri.u32 d8, d4, #17\n\t" - "eor %[sha256], %[sha256], r3, ror 11\n\t" + "eor %[sha256], %[sha256], r3, ror #11\n\t" "vshl.u32 d9, d4, #13\n\t" "and %[data], %[data], r3\n\t" "vsri.u32 d9, d4, #19\n\t" - "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[sha256], %[sha256], r3, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r5\n\t" "vshr.u32 d8, d4, #10\n\t" @@ -1959,13 +1967,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r7, r8\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor %[sha256], %[sha256], r7, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r8, r9\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[sha256], %[sha256], r7, ror #22\n\t" "vadd.i32 d5, d9\n\t" "eor %[data], %[data], r8\n\t" "add r6, r6, %[sha256]\n\t" @@ -1974,9 +1982,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d6[0]\n\t" "ror %[sha256], %[len], #6\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "eor %[sha256], %[sha256], %[len], ror #11\n\t" "and %[data], %[data], %[len]\n\t" - "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[sha256], %[sha256], %[len], ror #25\n\t" "eor %[data], %[data], r4\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -1986,10 +1994,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r9, r9, r5\n\t" "ror %[sha256], r6, #2\n\t" "eor %[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor %[sha256], %[sha256], r6, ror #13\n\t" "eor r10, r7, r8\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[sha256], %[sha256], r6, ror #22\n\t" "eor %[data], %[data], r7\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -2001,11 +2009,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d5, #15\n\t" "eor %[data], %[len], r3\n\t" "vsri.u32 d8, d5, #17\n\t" - "eor %[sha256], %[sha256], r9, ror 11\n\t" + "eor %[sha256], %[sha256], r9, ror #11\n\t" "vshl.u32 d9, d5, #13\n\t" "and %[data], %[data], r9\n\t" "vsri.u32 d9, d5, #19\n\t" - "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[sha256], %[sha256], r9, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r3\n\t" "vshr.u32 d8, d5, #10\n\t" @@ -2025,13 +2033,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* 
data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r5, r6\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor %[sha256], %[sha256], r5, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r6, r7\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[sha256], %[sha256], r5, ror #22\n\t" "vadd.i32 d6, d9\n\t" "eor %[data], %[data], r6\n\t" "add r4, r4, %[sha256]\n\t" @@ -2040,9 +2048,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d7[0]\n\t" "ror %[sha256], r8, #6\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r8, ror 11\n\t" + "eor %[sha256], %[sha256], r8, ror #11\n\t" "and %[data], %[data], r8\n\t" - "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[sha256], %[sha256], r8, ror #25\n\t" "eor %[data], %[data], %[len]\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2052,10 +2060,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r7, r7, r3\n\t" "ror %[sha256], r4, #2\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor %[sha256], %[sha256], r4, ror #13\n\t" "eor r10, r5, r6\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[sha256], %[sha256], r4, ror #22\n\t" "eor %[data], %[data], r5\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2067,11 +2075,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d8, d6, #15\n\t" "eor %[data], r8, r9\n\t" "vsri.u32 d8, d6, #17\n\t" - "eor %[sha256], %[sha256], r7, ror 11\n\t" + "eor %[sha256], %[sha256], r7, ror #11\n\t" "vshl.u32 d9, d6, #13\n\t" "and %[data], %[data], r7\n\t" "vsri.u32 d9, d6, #19\n\t" - "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[sha256], %[sha256], r7, ror #25\n\t" "veor d9, d8\n\t" "eor %[data], %[data], r9\n\t" "vshr.u32 d8, d6, #10\n\t" @@ -2091,13 +2099,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vshl.u32 d9, d10, #14\n\t" "eor %[data], r3, r4\n\t" "vsri.u32 d9, d10, #18\n\t" - "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor %[sha256], %[sha256], r3, ror #13\n\t" "veor d9, d8\n\t" "eor r10, r4, r5\n\t" "vshr.u32 d10, #3\n\t" "and %[data], %[data], r10\n\t" "veor d9, d10\n\t" - "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[sha256], %[sha256], r3, ror #22\n\t" "vadd.i32 d7, d9\n\t" "eor %[data], %[data], r4\n\t" "add %[len], %[len], %[sha256]\n\t" @@ -2109,9 +2117,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d0[0]\n\t" "ror %[sha256], r6, #6\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r6, ror 11\n\t" + "eor %[sha256], %[sha256], r6, ror #11\n\t" "and %[data], %[data], r6\n\t" - "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[sha256], %[sha256], r6, ror #25\n\t" "eor %[data], %[data], r8\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -2121,10 +2129,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r5, r5, r9\n\t" "ror %[sha256], %[len], #2\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor %[sha256], %[sha256], %[len], ror #13\n\t" "eor r10, r3, r4\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[sha256], %[sha256], %[len], ror #22\n\t" "eor %[data], %[data], r3\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -2132,9 +2140,9 @@ 
void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d0[1]\n\t" "ror %[sha256], r5, #6\n\t" "eor %[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r5, ror 11\n\t" + "eor %[sha256], %[sha256], r5, ror #11\n\t" "and %[data], %[data], r5\n\t" - "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[sha256], %[sha256], r5, ror #25\n\t" "eor %[data], %[data], r7\n\t" "add r8, r8, %[sha256]\n\t" "add r8, r8, %[data]\n\t" @@ -2144,10 +2152,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r4, r4, r8\n\t" "ror %[sha256], r9, #2\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor %[sha256], %[sha256], r9, ror #13\n\t" "eor r10, %[len], r3\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r9, ror 22\n\t" + "eor %[sha256], %[sha256], r9, ror #22\n\t" "eor %[data], %[data], %[len]\n\t" "add r8, r8, %[sha256]\n\t" "add r8, r8, %[data]\n\t" @@ -2155,9 +2163,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d1[0]\n\t" "ror %[sha256], r4, #6\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r4, ror 11\n\t" + "eor %[sha256], %[sha256], r4, ror #11\n\t" "and %[data], %[data], r4\n\t" - "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[sha256], %[sha256], r4, ror #25\n\t" "eor %[data], %[data], r6\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -2167,10 +2175,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r3, r3, r7\n\t" "ror %[sha256], r8, #2\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor %[sha256], %[sha256], r8, ror #13\n\t" "eor r10, r9, %[len]\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[sha256], %[sha256], r8, ror #22\n\t" "eor %[data], %[data], r9\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -2178,9 +2186,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d1[1]\n\t" "ror %[sha256], r3, #6\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r3, ror 11\n\t" + "eor %[sha256], %[sha256], r3, ror #11\n\t" "and %[data], %[data], r3\n\t" - "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[sha256], %[sha256], r3, ror #25\n\t" "eor %[data], %[data], r5\n\t" "add r6, r6, %[sha256]\n\t" "add r6, r6, %[data]\n\t" @@ -2190,10 +2198,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add %[len], %[len], r6\n\t" "ror %[sha256], r7, #2\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor %[sha256], %[sha256], r7, ror #13\n\t" "eor r10, r8, r9\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[sha256], %[sha256], r7, ror #22\n\t" "eor %[data], %[data], r8\n\t" "add r6, r6, %[sha256]\n\t" "add r6, r6, %[data]\n\t" @@ -2201,9 +2209,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d2[0]\n\t" "ror %[sha256], %[len], #6\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "eor %[sha256], %[sha256], %[len], ror #11\n\t" "and %[data], %[data], %[len]\n\t" - "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[sha256], %[sha256], %[len], ror #25\n\t" "eor %[data], %[data], r4\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -2213,10 +2221,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r9, r9, r5\n\t" "ror %[sha256], r6, #2\n\t" "eor 
%[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor %[sha256], %[sha256], r6, ror #13\n\t" "eor r10, r7, r8\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[sha256], %[sha256], r6, ror #22\n\t" "eor %[data], %[data], r7\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -2224,9 +2232,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d2[1]\n\t" "ror %[sha256], r9, #6\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], r9, ror 11\n\t" + "eor %[sha256], %[sha256], r9, ror #11\n\t" "and %[data], %[data], r9\n\t" - "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[sha256], %[sha256], r9, ror #25\n\t" "eor %[data], %[data], r3\n\t" "add r4, r4, %[sha256]\n\t" "add r4, r4, %[data]\n\t" @@ -2236,10 +2244,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r8, r8, r4\n\t" "ror %[sha256], r5, #2\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor %[sha256], %[sha256], r5, ror #13\n\t" "eor r10, r6, r7\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[sha256], %[sha256], r5, ror #22\n\t" "eor %[data], %[data], r6\n\t" "add r4, r4, %[sha256]\n\t" "add r4, r4, %[data]\n\t" @@ -2247,9 +2255,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d3[0]\n\t" "ror %[sha256], r8, #6\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r8, ror 11\n\t" + "eor %[sha256], %[sha256], r8, ror #11\n\t" "and %[data], %[data], r8\n\t" - "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[sha256], %[sha256], r8, ror #25\n\t" "eor %[data], %[data], %[len]\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2259,10 +2267,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r7, r7, r3\n\t" "ror %[sha256], r4, #2\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor %[sha256], %[sha256], r4, ror #13\n\t" "eor r10, r5, r6\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[sha256], %[sha256], r4, ror #22\n\t" "eor %[data], %[data], r5\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2270,9 +2278,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d3[1]\n\t" "ror %[sha256], r7, #6\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r7, ror 11\n\t" + "eor %[sha256], %[sha256], r7, ror #11\n\t" "and %[data], %[data], r7\n\t" - "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[sha256], %[sha256], r7, ror #25\n\t" "eor %[data], %[data], r9\n\t" "add %[len], %[len], %[sha256]\n\t" "add %[len], %[len], %[data]\n\t" @@ -2282,10 +2290,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r6, r6, %[len]\n\t" "ror %[sha256], r3, #2\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor %[sha256], %[sha256], r3, ror #13\n\t" "eor r10, r4, r5\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[sha256], %[sha256], r3, ror #22\n\t" "eor %[data], %[data], r4\n\t" "add %[len], %[len], %[sha256]\n\t" "add %[len], %[len], %[data]\n\t" @@ -2293,9 +2301,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d4[0]\n\t" "ror %[sha256], r6, #6\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r6, ror 11\n\t" + "eor %[sha256], %[sha256], r6, ror #11\n\t" 
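
Functionally these hunks only add the `#` prefix to the rotate amounts in the shifted-register operands; GNU as accepts the bare number, but `#` is the standard unified-syntax spelling for an immediate and the safer form for stricter assemblers. The rotation constants are the FIPS 180-4 ones: the `ror #6/#11/#25` groups compute Sigma1, the `ror #2/#13/#22` groups compute Sigma0, and the NEON `vshl`/`vsri`/`vshr` pairs in the message schedule compute sigma0 and sigma1. For reference, plain-C versions of those functions (helper names are illustrative, not taken from the wolfSSL sources):

#include <stdint.h>

/* 32-bit rotate right, 0 < n < 32. */
static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* Compression-function sigmas, matching the eor/ror immediates above. */
static uint32_t Sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
static uint32_t Sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }

/* Message-schedule sigmas, matching the vshl/vsri (rotates) and vshr (shifts). */
static uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }
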
"and %[data], %[data], r6\n\t" - "eor %[sha256], %[sha256], r6, ror 25\n\t" + "eor %[sha256], %[sha256], r6, ror #25\n\t" "eor %[data], %[data], r8\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -2305,10 +2313,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r5, r5, r9\n\t" "ror %[sha256], %[len], #2\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], %[len], ror 13\n\t" + "eor %[sha256], %[sha256], %[len], ror #13\n\t" "eor r10, r3, r4\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], %[len], ror 22\n\t" + "eor %[sha256], %[sha256], %[len], ror #22\n\t" "eor %[data], %[data], r3\n\t" "add r9, r9, %[sha256]\n\t" "add r9, r9, %[data]\n\t" @@ -2316,9 +2324,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d4[1]\n\t" "ror %[sha256], r5, #6\n\t" "eor %[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r5, ror 11\n\t" + "eor %[sha256], %[sha256], r5, ror #11\n\t" "and %[data], %[data], r5\n\t" - "eor %[sha256], %[sha256], r5, ror 25\n\t" + "eor %[sha256], %[sha256], r5, ror #25\n\t" "eor %[data], %[data], r7\n\t" "add r8, r8, %[sha256]\n\t" "add r8, r8, %[data]\n\t" @@ -2328,10 +2336,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r4, r4, r8\n\t" "ror %[sha256], r9, #2\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r9, ror 13\n\t" + "eor %[sha256], %[sha256], r9, ror #13\n\t" "eor r10, %[len], r3\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r9, ror 22\n\t" + "eor %[sha256], %[sha256], r9, ror #22\n\t" "eor %[data], %[data], %[len]\n\t" "add r8, r8, %[sha256]\n\t" "add r8, r8, %[data]\n\t" @@ -2339,9 +2347,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d5[0]\n\t" "ror %[sha256], r4, #6\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r4, ror 11\n\t" + "eor %[sha256], %[sha256], r4, ror #11\n\t" "and %[data], %[data], r4\n\t" - "eor %[sha256], %[sha256], r4, ror 25\n\t" + "eor %[sha256], %[sha256], r4, ror #25\n\t" "eor %[data], %[data], r6\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -2351,10 +2359,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r3, r3, r7\n\t" "ror %[sha256], r8, #2\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r8, ror 13\n\t" + "eor %[sha256], %[sha256], r8, ror #13\n\t" "eor r10, r9, %[len]\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r8, ror 22\n\t" + "eor %[sha256], %[sha256], r8, ror #22\n\t" "eor %[data], %[data], r9\n\t" "add r7, r7, %[sha256]\n\t" "add r7, r7, %[data]\n\t" @@ -2362,9 +2370,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d5[1]\n\t" "ror %[sha256], r3, #6\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r3, ror 11\n\t" + "eor %[sha256], %[sha256], r3, ror #11\n\t" "and %[data], %[data], r3\n\t" - "eor %[sha256], %[sha256], r3, ror 25\n\t" + "eor %[sha256], %[sha256], r3, ror #25\n\t" "eor %[data], %[data], r5\n\t" "add r6, r6, %[sha256]\n\t" "add r6, r6, %[data]\n\t" @@ -2374,10 +2382,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add %[len], %[len], r6\n\t" "ror %[sha256], r7, #2\n\t" "eor %[data], r7, r8\n\t" - "eor %[sha256], %[sha256], r7, ror 13\n\t" + "eor %[sha256], %[sha256], r7, ror #13\n\t" "eor r10, r8, r9\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r7, ror 22\n\t" + "eor %[sha256], %[sha256], r7, ror 
#22\n\t" "eor %[data], %[data], r8\n\t" "add r6, r6, %[sha256]\n\t" "add r6, r6, %[data]\n\t" @@ -2385,9 +2393,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d6[0]\n\t" "ror %[sha256], %[len], #6\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], %[len], ror 11\n\t" + "eor %[sha256], %[sha256], %[len], ror #11\n\t" "and %[data], %[data], %[len]\n\t" - "eor %[sha256], %[sha256], %[len], ror 25\n\t" + "eor %[sha256], %[sha256], %[len], ror #25\n\t" "eor %[data], %[data], r4\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -2397,10 +2405,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r9, r9, r5\n\t" "ror %[sha256], r6, #2\n\t" "eor %[data], r6, r7\n\t" - "eor %[sha256], %[sha256], r6, ror 13\n\t" + "eor %[sha256], %[sha256], r6, ror #13\n\t" "eor r10, r7, r8\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r6, ror 22\n\t" + "eor %[sha256], %[sha256], r6, ror #22\n\t" "eor %[data], %[data], r7\n\t" "add r5, r5, %[sha256]\n\t" "add r5, r5, %[data]\n\t" @@ -2408,9 +2416,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d6[1]\n\t" "ror %[sha256], r9, #6\n\t" "eor %[data], %[len], r3\n\t" - "eor %[sha256], %[sha256], r9, ror 11\n\t" + "eor %[sha256], %[sha256], r9, ror #11\n\t" "and %[data], %[data], r9\n\t" - "eor %[sha256], %[sha256], r9, ror 25\n\t" + "eor %[sha256], %[sha256], r9, ror #25\n\t" "eor %[data], %[data], r3\n\t" "add r4, r4, %[sha256]\n\t" "add r4, r4, %[data]\n\t" @@ -2420,10 +2428,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r8, r8, r4\n\t" "ror %[sha256], r5, #2\n\t" "eor %[data], r5, r6\n\t" - "eor %[sha256], %[sha256], r5, ror 13\n\t" + "eor %[sha256], %[sha256], r5, ror #13\n\t" "eor r10, r6, r7\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r5, ror 22\n\t" + "eor %[sha256], %[sha256], r5, ror #22\n\t" "eor %[data], %[data], r6\n\t" "add r4, r4, %[sha256]\n\t" "add r4, r4, %[data]\n\t" @@ -2431,9 +2439,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d7[0]\n\t" "ror %[sha256], r8, #6\n\t" "eor %[data], r9, %[len]\n\t" - "eor %[sha256], %[sha256], r8, ror 11\n\t" + "eor %[sha256], %[sha256], r8, ror #11\n\t" "and %[data], %[data], r8\n\t" - "eor %[sha256], %[sha256], r8, ror 25\n\t" + "eor %[sha256], %[sha256], r8, ror #25\n\t" "eor %[data], %[data], %[len]\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2443,10 +2451,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r7, r7, r3\n\t" "ror %[sha256], r4, #2\n\t" "eor %[data], r4, r5\n\t" - "eor %[sha256], %[sha256], r4, ror 13\n\t" + "eor %[sha256], %[sha256], r4, ror #13\n\t" "eor r10, r5, r6\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r4, ror 22\n\t" + "eor %[sha256], %[sha256], r4, ror #22\n\t" "eor %[data], %[data], r5\n\t" "add r3, r3, %[sha256]\n\t" "add r3, r3, %[data]\n\t" @@ -2454,9 +2462,9 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "vmov r10, d7[1]\n\t" "ror %[sha256], r7, #6\n\t" "eor %[data], r8, r9\n\t" - "eor %[sha256], %[sha256], r7, ror 11\n\t" + "eor %[sha256], %[sha256], r7, ror #11\n\t" "and %[data], %[data], r7\n\t" - "eor %[sha256], %[sha256], r7, ror 25\n\t" + "eor %[sha256], %[sha256], r7, ror #25\n\t" "eor %[data], %[data], r9\n\t" "add %[len], %[len], %[sha256]\n\t" "add %[len], %[len], %[data]\n\t" @@ -2466,10 +2474,10 @@ void 
Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "add r6, r6, %[len]\n\t" "ror %[sha256], r3, #2\n\t" "eor %[data], r3, r4\n\t" - "eor %[sha256], %[sha256], r3, ror 13\n\t" + "eor %[sha256], %[sha256], r3, ror #13\n\t" "eor r10, r4, r5\n\t" "and %[data], %[data], r10\n\t" - "eor %[sha256], %[sha256], r3, ror 22\n\t" + "eor %[sha256], %[sha256], r3, ror #22\n\t" "eor %[data], %[data], r4\n\t" "add %[len], %[len], %[sha256]\n\t" "add %[len], %[len], %[data]\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index 5c17f0151..f08e72796 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -120,9 +120,13 @@ static const uint64_t L_SHA512_transform_len_k[] = { 0x6c44198c4a475817UL, }; -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { + register wc_Sha512* sha512 asm ("r0") = sha512_p; + register const byte* data asm ("r1") = data_p; + register word32 len asm ("r2") = len_p; + __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" "mov r3, %[L_SHA512_transform_len_k]\n\t" @@ -240,18 +244,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -284,18 +288,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #24]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -320,17 +324,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #112]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl 
#26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp]\n\t" @@ -343,17 +347,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #8]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp]\n\t" @@ -364,18 +368,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #24]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -408,18 +412,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #16]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -444,17 +448,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #120]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #8]\n\t" @@ -467,17 +471,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #16]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, 
lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #8]\n\t" @@ -488,18 +492,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #16]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -532,18 +536,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #8]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -568,17 +572,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #16]\n\t" @@ -591,17 +595,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #24]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #16]\n\t" @@ -612,18 +616,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #8]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl 
#14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -656,18 +660,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512]]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -692,17 +696,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #8]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #24]\n\t" @@ -715,17 +719,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #32]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #24]\n\t" @@ -736,18 +740,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512]]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -780,18 +784,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #56]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" 
"lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -816,17 +820,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #16]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #32]\n\t" @@ -839,17 +843,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #40]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #32]\n\t" @@ -860,18 +864,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #56]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -904,18 +908,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #48]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -940,17 +944,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #24]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr 
r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #40]\n\t" @@ -963,17 +967,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #48]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #40]\n\t" @@ -984,18 +988,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #48]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1028,18 +1032,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #40]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1064,17 +1068,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #32]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #48]\n\t" @@ -1087,17 +1091,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) 
"ldrd r12, lr, [sp, #56]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #48]\n\t" @@ -1108,18 +1112,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #40]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1152,18 +1156,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #32]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1188,17 +1192,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #40]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #56]\n\t" @@ -1211,17 +1215,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #64]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #56]\n\t" @@ -1232,18 +1236,18 @@ 
void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1276,18 +1280,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #24]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1312,17 +1316,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #48]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #64]\n\t" @@ -1335,17 +1339,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #72]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #64]\n\t" @@ -1356,18 +1360,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #24]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" 
+ "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1400,18 +1404,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #16]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1436,17 +1440,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #56]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #72]\n\t" @@ -1459,17 +1463,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #80]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #72]\n\t" @@ -1480,18 +1484,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #16]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1524,18 +1528,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #8]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr 
#2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1560,17 +1564,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #64]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #80]\n\t" @@ -1583,17 +1587,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #88]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #80]\n\t" @@ -1604,18 +1608,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #8]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1648,18 +1652,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512]]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1684,17 +1688,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #72]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, 
r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #88]\n\t" @@ -1707,17 +1711,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #96]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #88]\n\t" @@ -1728,18 +1732,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512]]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1772,18 +1776,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #56]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1808,17 +1812,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #80]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #96]\n\t" @@ -1831,17 +1835,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #104]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 
31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #96]\n\t" @@ -1852,18 +1856,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #56]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1896,18 +1900,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #48]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -1932,17 +1936,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #88]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #104]\n\t" @@ -1955,17 +1959,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #112]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #104]\n\t" @@ -1976,18 +1980,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #48]\n\t" 
"lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2020,18 +2024,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #40]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2056,17 +2060,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #96]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #112]\n\t" @@ -2079,17 +2083,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #120]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #112]\n\t" @@ -2100,18 +2104,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #40]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, 
r6\n\t" "eor r5, r5, r7\n\t" @@ -2144,18 +2148,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #32]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2180,17 +2184,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp, #104]\n\t" "lsrs r4, r12, #19\n\t" "lsrs r5, lr, #19\n\t" - "orr r5, r5, r12, lsl 13\n\t" - "orr r4, r4, lr, lsl 13\n\t" + "orr r5, r5, r12, lsl #13\n\t" + "orr r4, r4, lr, lsl #13\n\t" "lsls r6, r12, #3\n\t" "lsls r7, lr, #3\n\t" - "orr r7, r7, r12, lsr 29\n\t" - "orr r6, r6, lr, lsr 29\n\t" + "orr r7, r7, r12, lsr #29\n\t" + "orr r6, r6, lr, lsr #29\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #6\n\t" "lsrs r7, lr, #6\n\t" - "orr r6, r6, lr, lsl 26\n\t" + "orr r6, r6, lr, lsl #26\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #120]\n\t" @@ -2203,17 +2207,17 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [sp]\n\t" "lsrs r4, r12, #1\n\t" "lsrs r5, lr, #1\n\t" - "orr r5, r5, r12, lsl 31\n\t" - "orr r4, r4, lr, lsl 31\n\t" + "orr r5, r5, r12, lsl #31\n\t" + "orr r4, r4, lr, lsl #31\n\t" "lsrs r6, r12, #8\n\t" "lsrs r7, lr, #8\n\t" - "orr r7, r7, r12, lsl 24\n\t" - "orr r6, r6, lr, lsl 24\n\t" + "orr r7, r7, r12, lsl #24\n\t" + "orr r6, r6, lr, lsl #24\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "lsrs r6, r12, #7\n\t" "lsrs r7, lr, #7\n\t" - "orr r6, r6, lr, lsl 25\n\t" + "orr r6, r6, lr, lsl #25\n\t" "eor r5, r5, r7\n\t" "eor r4, r4, r6\n\t" "ldrd r12, lr, [sp, #120]\n\t" @@ -2227,18 +2231,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2271,18 +2275,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #24]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 
7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2307,18 +2311,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #24]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2351,18 +2355,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #16]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2387,18 +2391,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #16]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2431,18 +2435,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #8]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2467,18 +2471,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #8]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + 
"orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2511,18 +2515,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512]]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2547,18 +2551,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512]]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2591,18 +2595,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #56]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2627,18 +2631,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #56]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, 
r6\n\t" "eor r5, r5, r7\n\t" @@ -2671,18 +2675,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #48]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2707,18 +2711,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #48]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2751,18 +2755,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #40]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2787,18 +2791,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #40]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2831,18 +2835,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #32]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr 
#2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2867,18 +2871,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #32]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2911,18 +2915,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #24]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #56]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2947,18 +2951,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #24]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -2991,18 +2995,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #16]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #48]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3027,18 +3031,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], 
#16]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3071,18 +3075,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #8]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #40]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3107,18 +3111,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #8]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3151,18 +3155,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512]]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #32]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3187,18 +3191,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512]]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 
9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3231,18 +3235,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #56]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #24]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3267,18 +3271,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #56]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3311,18 +3315,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #48]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #16]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3347,18 +3351,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #48]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3391,18 +3395,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #40]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr 
r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512], #8]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3427,18 +3431,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "ldrd r12, lr, [%[sha512], #40]\n\t" "lsrs r4, r12, #14\n\t" "lsrs r5, lr, #14\n\t" - "orr r5, r5, r12, lsl 18\n\t" - "orr r4, r4, lr, lsl 18\n\t" + "orr r5, r5, r12, lsl #18\n\t" + "orr r4, r4, lr, lsl #18\n\t" "lsrs r6, r12, #18\n\t" "lsrs r7, lr, #18\n\t" - "orr r7, r7, r12, lsl 14\n\t" - "orr r6, r6, lr, lsl 14\n\t" + "orr r7, r7, r12, lsl #14\n\t" + "orr r6, r6, lr, lsl #14\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #23\n\t" "lsls r7, lr, #23\n\t" - "orr r7, r7, r12, lsr 9\n\t" - "orr r6, r6, lr, lsr 9\n\t" + "orr r7, r7, r12, lsr #9\n\t" + "orr r6, r6, lr, lsr #9\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3471,18 +3475,18 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "strd r6, r7, [%[sha512], #32]\n\t" "lsrs r4, r12, #28\n\t" "lsrs r5, lr, #28\n\t" - "orr r5, r5, r12, lsl 4\n\t" - "orr r4, r4, lr, lsl 4\n\t" + "orr r5, r5, r12, lsl #4\n\t" + "orr r4, r4, lr, lsl #4\n\t" "lsls r6, r12, #30\n\t" "lsls r7, lr, #30\n\t" - "orr r7, r7, r12, lsr 2\n\t" - "orr r6, r6, lr, lsr 2\n\t" + "orr r7, r7, r12, lsr #2\n\t" + "orr r6, r6, lr, lsr #2\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" "lsls r6, r12, #25\n\t" "lsls r7, lr, #25\n\t" - "orr r7, r7, r12, lsr 7\n\t" - "orr r6, r6, lr, lsr 7\n\t" + "orr r7, r7, r12, lsr #7\n\t" + "orr r6, r6, lr, lsr #7\n\t" "ldrd r12, lr, [%[sha512]]\n\t" "eor r4, r4, r6\n\t" "eor r5, r5, r7\n\t" @@ -3651,9 +3655,13 @@ static const uint64_t L_SHA512_transform_neon_len_k[] = { 0x6c44198c4a475817UL, }; -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); -void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); +void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { + register wc_Sha512* sha512 asm ("r0") = sha512_p; + register const byte* data asm ("r1") = data_p; + register word32 len asm ("r2") = len_p; + __asm__ __volatile__ ( /* Load digest into working vars */ "vldm.64 %[sha512], {d0-d7}\n\t" From 1b9656f72db24fdfab2d0f151f77b534289c7118 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 21 Sep 2022 11:57:01 +1000 Subject: [PATCH 9/9] ARM ASM: add fpu directive to assembly files --- wolfcrypt/src/port/arm/armv8-32-sha256-asm.S | 4 +++- wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c | 3 ++- wolfcrypt/src/port/arm/armv8-32-sha512-asm.S | 6 +++++- wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c | 5 ++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index e705558a9..6814bdd4d 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -1541,6 +1541,7 @@ L_SHA256_transform_neon_len_k: .word 0xc67178f2 .text .align 2 + .fpu neon .globl Transform_Sha256_Len .type Transform_Sha256_Len, 
%function Transform_Sha256_Len: @@ -1558,7 +1559,8 @@ Transform_Sha256_Len: # Start of loop processing a block L_SHA256_transform_neon_len_begin: # Load W - vldm.32 r1!, {d0-d7} + vld1.8 {d0-d3}, [r1]! + vld1.8 {d4-d7}, [r1]! #ifndef WOLFSSL_ARM_ARCH_NEON_64BIT vrev32.8 q0, q0 vrev32.8 q1, q1 diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 22b1331fa..e211af660 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -1561,7 +1561,8 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "\n" "L_SHA256_transform_neon_len_begin_%=: \n\t" /* Load W */ - "vldm.32 %[data]!, {d0-d7}\n\t" + "vld1.8 {d0-d3}, [%[data]]!\n\t" + "vld1.8 {d4-d7}, [%[data]]!\n\t" #ifndef WOLFSSL_ARM_ARCH_NEON_64BIT "vrev32.8 q0, q0\n\t" "vrev32.8 q1, q1\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 7d4dcdc26..0dd0c6b36 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -4209,6 +4209,7 @@ L_SHA512_transform_neon_len_k: .word 0x6c44198c .text .align 2 + .fpu neon .globl Transform_Sha512_Len .type Transform_Sha512_Len, %function Transform_Sha512_Len: @@ -4218,7 +4219,10 @@ Transform_Sha512_Len: # Start of loop processing a block L_SHA512_transform_neon_len_begin: # Load W - vldm.64 r1!, {d16-d31} + vld1.8 {q8, q9}, [r1]! + vld1.8 {q10, q11}, [r1]! + vld1.8 {q12, q13}, [r1]! + vld1.8 {q14, q15}, [r1]! #ifndef WOLFSSL_ARM_ARCH_NEON_64BIT vrev64.8 q8, q8 vrev64.8 q9, q9 diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index f08e72796..ef1b2b1d4 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -3669,7 +3669,10 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "\n" "L_SHA512_transform_neon_len_begin_%=: \n\t" /* Load W */ - "vldm.64 %[data]!, {d16-d31}\n\t" + "vld1.8 {q8-q9}, [%[data]]!\n\t" + "vld1.8 {q10-q11}, [%[data]]!\n\t" + "vld1.8 {q12-q13}, [%[data]]!\n\t" + "vld1.8 {q14-q15}, [%[data]]!\n\t" #ifndef WOLFSSL_ARM_ARCH_NEON_64BIT "vrev64.8 q8, q8\n\t" "vrev64.8 q9, q9\n\t"
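
Note on the scalar Transform_Sha512_Len hunks above: each one builds a 64-bit rotate from the 32-bit halves held in the r12:lr pair, shifting both halves and OR-ing in the bits that fall out of the partner register; the change itself only adds the '#' prefix on those shift immediates so that stricter assemblers accept them. A minimal C sketch of the underlying rotate, assuming a rotate amount below 32 (the helper name is illustrative, not from the patch):

    #include <stdint.h>

    /* 64-bit rotate-right by n (0 < n < 32) built from 32-bit halves,
     * mirroring the lsrs / orr ..., lsl #(32 - n) sequences on the
     * r12:lr register pair in the hunks above. */
    static uint64_t ror64_halves(uint32_t lo, uint32_t hi, unsigned n)
    {
        uint32_t out_lo = (lo >> n) | (hi << (32u - n));
        uint32_t out_hi = (hi >> n) | (lo << (32u - n));
        return ((uint64_t)out_hi << 32) | out_lo;
    }

Each SHA-512 Sigma/sigma term is the XOR of three such rotates (or a rotate/shift mix), which is why every hunk repeats the shift/orr block three times with different immediates before the eor instructions.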
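
The Transform_Sha512_Len prototype change in the inline-assembly variant uses GCC local register variables to pin each parameter to its AAPCS argument register before the asm block runs, so the hand-written body can rely on which registers hold its inputs. A sketch of the same idiom with illustrative names (only the register-variable pattern is taken from the patch):

    void example_transform(unsigned char* state_p, const unsigned char* data_p,
                           unsigned int len_p)
    {
        /* Pin the parameters to r0-r2 before entering the asm body. */
        register unsigned char* state asm ("r0") = state_p;
        register const unsigned char* data asm ("r1") = data_p;
        register unsigned int len asm ("r2") = len_p;

        __asm__ __volatile__ (
            ""                       /* asm body elided in this sketch */
            : [state] "+r" (state), [data] "+r" (data), [len] "+r" (len)
            :
            : "memory", "cc"
        );
    }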
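
In the second patch, the W loads switch from vldm to byte-wise vld1.8 loads and each .S file declares .fpu neon. The .fpu directive lets the files assemble without depending on an -mfpu option on the assembler command line, and the vld1.8 form reads the block as bytes, presumably to drop the word-alignment assumption that vldm carries. An intrinsic-level sketch of the byte-wise load (vld1q_u8 is the standard NEON intrinsic; the wrapper and its name are illustrative only):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Read 16 message bytes with a byte-element NEON load; valid for any
     * pointer alignment, unlike a d/q-register vldm. */
    static uint8x16_t load_16_bytes(const uint8_t* p)
    {
        return vld1q_u8(p);
    }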