diff --git a/src/include.am b/src/include.am
index 2eb5697e7..f0307859c 100644
--- a/src/include.am
+++ b/src/include.am
@@ -236,6 +236,7 @@ if BUILD_SHA512
 if BUILD_ARMASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
 if BUILD_INTELASM
@@ -387,7 +388,7 @@ if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
 else
 if BUILD_ARMASM
-src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv7-curve25519.S
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519.S
 src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S
 else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index ba1f7b6a7..5fa0b019d 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -52,8 +52,9 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
     wolfcrypt/src/port/arm/armv8-sha256.c \
     wolfcrypt/src/port/arm/armv8-chacha.c \
     wolfcrypt/src/port/arm/armv8-curve25519.c \
-    wolfcrypt/src/port/arm/armv7-curve25519.c \
+    wolfcrypt/src/port/arm/armv8-32-curve25519.c \
     wolfcrypt/src/port/arm/armv8-sha512-asm.c \
+    wolfcrypt/src/port/arm/armv8-32-sha512-asm.c \
     wolfcrypt/src/port/nxp/ksdk_port.c \
     wolfcrypt/src/port/atmel/README.md \
     wolfcrypt/src/port/xilinx/xil-sha3.c \
diff --git a/wolfcrypt/src/port/arm/armv7-curve25519.c b/wolfcrypt/src/port/arm/armv7-curve25519.c
deleted file mode 100644
index 15c0cdf86..000000000
--- a/wolfcrypt/src/port/arm/armv7-curve25519.c
+++ /dev/null
@@ -1,5623 +0,0 @@
-/* armv7-curve25519
- *
- * Copyright (C) 2006-2019 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA - */ - -#ifndef __aarch64__ -#ifdef HAVE_CONFIG_H - #include -#endif - -#include -#include -#include - -void fe_init() -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "\n\t" - "add sp, sp, #0\n\t" - : - : - : "memory" - ); -} - -void fe_frombytes(fe out, const unsigned char* in) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "ldrd r2, r3, [r1]\n\t" - "ldrd r12, r4, [r1, #8]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" - "ldrd r7, r8, [r1, #24]\n\t" - "and r8, r8, #0x7fffffff\n\t" - "strd r2, r3, [r0]\n\t" - "strd r12, r4, [r0, #8]\n\t" - "strd r5, r6, [r0, #16]\n\t" - "strd r7, r8, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [out] "+r" (out), [in] "+r" (in) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8" - ); -} - -void fe_tobytes(unsigned char* out, const fe n) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "ldrd r2, r3, [r1]\n\t" - "ldrd r12, r4, [r1, #8]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" - "ldrd r7, r8, [r1, #24]\n\t" - "adds r9, r2, #19\n\t" - "adcs r9, r3, #0\n\t" - "adcs r9, r12, #0\n\t" - "adcs r9, r4, #0\n\t" - "adcs r9, r5, #0\n\t" - "adcs r9, r6, #0\n\t" - "adcs r9, r7, #0\n\t" - "adc r9, r8, #0\n\t" - "asr r9, r9, #31\n\t" - "and r9, r9, #19\n\t" - "adds r2, r2, r9\n\t" - "adcs r3, r3, #0\n\t" - "adcs r12, r12, #0\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adc r8, r8, #0\n\t" - "and r8, r8, #0x7fffffff\n\t" - "strd r2, r3, [r0]\n\t" - "strd r12, r4, [r0, #8]\n\t" - "strd r5, r6, [r0, #16]\n\t" - "strd r7, r8, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [out] "+r" (out), [n] "+r" (n) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9" - ); -} - -void fe_1(fe n) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Set one */ - "mov r2, #1\n\t" - "mov r1, #0\n\t" - "strd r2, r1, [r0]\n\t" - "strd r1, r1, [r0, #8]\n\t" - "strd r1, r1, [r0, #16]\n\t" - "strd r1, r1, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [n] "+r" (n) - : - : "memory", "r1", "r2" - ); -} - -void fe_0(fe n) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Set zero */ - "mov r1, #0\n\t" - "strd r1, r1, [r0]\n\t" - "strd r1, r1, [r0, #8]\n\t" - "strd r1, r1, [r0, #16]\n\t" - "strd r1, r1, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [n] "+r" (n) - : - : "memory", "r1" - ); -} - -void fe_copy(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Copy */ - "ldrd r2, r3, [r1]\n\t" - "ldrd r12, r4, [r1, #8]\n\t" - "strd r2, r3, [r0]\n\t" - "strd r12, r4, [r0, #8]\n\t" - "ldrd r2, r3, [r1, #16]\n\t" - "ldrd r12, r4, [r1, #24]\n\t" - "strd r2, r3, [r0, #16]\n\t" - "strd r12, r4, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "r4" - ); -} - -void fe_sub(fe r, const fe a, const fe b) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Sub */ - "ldrd r12, r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, r12, r7\n\t" - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd r12, r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, r12, r7\n\t" - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbc r10, r6, 
r10\n\t" - "mov r11, #-19\n\t" - "asr r3, r10, #31\n\t" - /* Mask the modulus */ - "and r11, r3, r11\n\t" - "and lr, r3, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd r12, r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "adds r12, r12, r11\n\t" - "adcs r4, r4, r3\n\t" - "adcs r5, r5, r3\n\t" - "adcs r6, r6, r3\n\t" - "adcs r7, r7, r3\n\t" - "adcs r8, r8, r3\n\t" - "adcs r9, r9, r3\n\t" - "adc r10, r10, lr\n\t" - "strd r12, r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_add(fe r, const fe a, const fe b) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Add */ - "ldrd r12, r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, r12, r7\n\t" - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd r12, r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, r12, r7\n\t" - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" - "mov r11, #-19\n\t" - "asr r3, r10, #31\n\t" - /* Mask the modulus */ - "and r11, r3, r11\n\t" - "and lr, r3, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd r12, r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "subs r12, r12, r11\n\t" - "sbcs r4, r4, r3\n\t" - "sbcs r5, r5, r3\n\t" - "sbcs r6, r6, r3\n\t" - "sbcs r7, r7, r3\n\t" - "sbcs r8, r8, r3\n\t" - "sbcs r9, r9, r3\n\t" - "sbc r10, r10, lr\n\t" - "strd r12, r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_neg(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "mov r6, #-1\n\t" - "mov r5, #-19\n\t" - "ldrd r2, r3, [r1]\n\t" - "ldrd r12, r4, [r1, #8]\n\t" - "subs r2, r5, r2\n\t" - "sbcs r3, r6, r3\n\t" - "sbcs r12, r6, r12\n\t" - "sbcs r4, r6, r4\n\t" - "strd r2, r3, [r0]\n\t" - "strd r12, r4, [r0, #8]\n\t" - "mov r5, #0x7fffffff\n\t" - "ldrd r2, r3, [r1, #16]\n\t" - "ldrd r12, r4, [r1, #24]\n\t" - "sbcs r2, r6, r2\n\t" - "sbcs r3, r6, r3\n\t" - "sbcs r12, r6, r12\n\t" - "sbc r4, r5, r4\n\t" - "strd r2, r3, [r0, #16]\n\t" - "strd r12, r4, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6" - ); -} - -int fe_isnonzero(const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "ldrd r2, r3, [r0]\n\t" - "ldrd r12, r4, [r0, #8]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" - "ldrd r7, r8, [r0, #24]\n\t" - "adds r1, r2, #19\n\t" - "adcs r1, r3, #0\n\t" - "adcs r1, r12, #0\n\t" - "adcs r1, r4, #0\n\t" - "adcs r1, r5, #0\n\t" - "adcs r1, r6, #0\n\t" - "adcs r1, r7, #0\n\t" - "adc r1, r8, #0\n\t" - "asr r1, r1, #31\n\t" - "and r1, r1, #19\n\t" - "adds r2, r2, r1\n\t" - "adcs r3, r3, #0\n\t" - "adcs r12, r12, #0\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adc r8, r8, #0\n\t" - "and r8, r8, #0x7fffffff\n\t" - "orr r2, r2, r3\n\t" - "orr r12, r12, r4\n\t" - "orr r5, r5, r6\n\t" - "orr r7, r7, r8\n\t" - "orr r12, r12, r5\n\t" - "orr r2, r2, r7\n\t" - "orr %[a], r2, r12\n\t" - "add 
sp, sp, #0\n\t" - : [a] "+r" (a) - : - : "memory", "r1", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9" - ); - return (uint32_t)(size_t)a; -} - -int fe_isnegative(const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "ldrd r2, r3, [r0]\n\t" - "ldrd r12, r4, [r0, #8]\n\t" - "adds r1, r2, #19\n\t" - "adcs r1, r3, #0\n\t" - "adcs r1, r12, #0\n\t" - "adcs r1, r4, #0\n\t" - "ldrd r2, r3, [r0, #16]\n\t" - "ldrd r12, r4, [r0, #24]\n\t" - "adcs r1, r2, #0\n\t" - "adcs r1, r3, #0\n\t" - "adcs r1, r12, #0\n\t" - "ldr r2, [r0]\n\t" - "adc r1, r4, #0\n\t" - "and %[a], r2, #1\n\t" - "lsr r1, r1, #31\n\t" - "eor %[a], %[a], r1\n\t" - "add sp, sp, #0\n\t" - : [a] "+r" (a) - : - : "memory", "r1", "r2", "r3", "r12", "r4" - ); - return (uint32_t)(size_t)a; -} - -void fe_cmov_table(fe* r, fe* base, signed char b) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - "sxtb %[b], %[b]\n\t" - "sbfx r8, %[b], #7, #1\n\t" - "eor r11, %[b], r8\n\t" - "sub r11, r11, r8\n\t" - "mov r3, #1\n\t" - "mov r12, #0\n\t" - "mov r4, #1\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" - "mov r7, #0\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #31\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #30\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #29\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #28\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - 
"eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #27\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #26\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #25\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #24\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #32]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #64]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "sub %[base], %[base], #0x2a0\n\t" - "mov r9, #-19\n\t" - "mov r10, #-1\n\t" - "subs r9, r9, r6\n\t" - "sbcs r10, r10, r7\n\t" - "sbc lr, lr, lr\n\t" - "asr r11, %[b], #31\n\t" - "eor r8, r3, r4\n\t" - "and r8, r8, r11\n\t" - "eor r3, r3, r8\n\t" - "eor r4, r4, r8\n\t" - "eor r8, r12, r5\n\t" - "and r8, r8, r11\n\t" - "eor r12, r12, r8\n\t" - "eor r5, r5, r8\n\t" - "eor r9, r9, r6\n\t" - "and r9, r9, r11\n\t" - "eor r6, r6, r9\n\t" - "eor r10, r10, r7\n\t" - "and r10, r10, r11\n\t" - "eor r7, r7, r10\n\t" - "strd r3, r12, [r0]\n\t" - "strd r4, r5, [r0, #32]\n\t" - "strd r6, r7, [r0, #64]\n\t" - "sbfx r8, %[b], #7, #1\n\t" - "eor r11, %[b], r8\n\t" - "sub r11, r11, r8\n\t" - "mov r3, #0\n\t" - "mov r12, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r6, 
#0\n\t" - "mov r7, #0\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #31\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #30\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #29\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #28\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #27\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #26\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and 
r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #25\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #24\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #40]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #72]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "sub %[base], %[base], #0x2a0\n\t" - "mov r9, #-1\n\t" - "mov r10, #-1\n\t" - "rsbs lr, lr, #0\n\t" - "sbcs r9, r9, r6\n\t" - "sbcs r10, r10, r7\n\t" - "sbc lr, lr, lr\n\t" - "asr r11, %[b], #31\n\t" - "eor r8, r3, r4\n\t" - "and r8, r8, r11\n\t" - "eor r3, r3, r8\n\t" - "eor r4, r4, r8\n\t" - "eor r8, r12, r5\n\t" - "and r8, r8, r11\n\t" - "eor r12, r12, r8\n\t" - "eor r5, r5, r8\n\t" - "eor r9, r9, r6\n\t" - "and r9, r9, r11\n\t" - "eor r6, r6, r9\n\t" - "eor r10, r10, r7\n\t" - "and r10, r10, r11\n\t" - "eor r7, r7, r10\n\t" - "strd r3, r12, [r0, #8]\n\t" - "strd r4, r5, [r0, #40]\n\t" - "strd r6, r7, [r0, #72]\n\t" - "sbfx r8, %[b], #7, #1\n\t" - "eor r11, %[b], r8\n\t" - "sub r11, r11, r8\n\t" - "mov r3, #0\n\t" - "mov r12, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" - "mov r7, #0\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #31\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #30\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, 
r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #29\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #28\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #27\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #26\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #25\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" 
- "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #24\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #16]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #48]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #80]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "sub %[base], %[base], #0x2a0\n\t" - "mov r9, #-1\n\t" - "mov r10, #-1\n\t" - "rsbs lr, lr, #0\n\t" - "sbcs r9, r9, r6\n\t" - "sbcs r10, r10, r7\n\t" - "sbc lr, lr, lr\n\t" - "asr r11, %[b], #31\n\t" - "eor r8, r3, r4\n\t" - "and r8, r8, r11\n\t" - "eor r3, r3, r8\n\t" - "eor r4, r4, r8\n\t" - "eor r8, r12, r5\n\t" - "and r8, r8, r11\n\t" - "eor r12, r12, r8\n\t" - "eor r5, r5, r8\n\t" - "eor r9, r9, r6\n\t" - "and r9, r9, r11\n\t" - "eor r6, r6, r9\n\t" - "eor r10, r10, r7\n\t" - "and r10, r10, r11\n\t" - "eor r7, r7, r10\n\t" - "strd r3, r12, [r0, #16]\n\t" - "strd r4, r5, [r0, #48]\n\t" - "strd r6, r7, [r0, #80]\n\t" - "sbfx r8, %[b], #7, #1\n\t" - "eor r11, %[b], r8\n\t" - "sub r11, r11, r8\n\t" - "mov r3, #0\n\t" - "mov r12, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" - "mov r7, #0\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #31\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #30\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #29\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" 
- "mov r8, #0x80000000\n\t" - "ror r8, r8, #28\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #27\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #26\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #25\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "add %[base], %[base], #0x60\n\t" - "mov r8, #0x80000000\n\t" - "ror r8, r8, #24\n\t" - "ror r8, r8, r11\n\t" - "asr r8, r8, #31\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "eor r9, r9, r3\n\t" - "eor r10, r10, r12\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r3, r3, r9\n\t" - "eor r12, r12, r10\n\t" - "ldrd r9, r10, [r1, #56]\n\t" - "eor r9, r9, r4\n\t" - "eor r10, r10, r5\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r4, r4, r9\n\t" - "eor r5, r5, r10\n\t" - "ldrd r9, r10, [r1, #88]\n\t" - "eor r9, r9, r6\n\t" - "eor r10, r10, r7\n\t" - "and r9, r9, r8\n\t" - "and r10, r10, r8\n\t" - "eor r6, r6, r9\n\t" - "eor r7, r7, r10\n\t" - "sub %[base], %[base], #0x2a0\n\t" - "mov r9, #-1\n\t" - "mov r10, #0x7fffffff\n\t" - "rsbs lr, lr, #0\n\t" - "sbcs r9, r9, r6\n\t" - "sbc r10, r10, r7\n\t" - "asr r11, %[b], #31\n\t" - "eor r8, r3, r4\n\t" - "and r8, r8, r11\n\t" - "eor r3, r3, r8\n\t" - "eor r4, r4, 
r8\n\t" - "eor r8, r12, r5\n\t" - "and r8, r8, r11\n\t" - "eor r12, r12, r8\n\t" - "eor r5, r5, r8\n\t" - "eor r9, r9, r6\n\t" - "and r9, r9, r11\n\t" - "eor r6, r6, r9\n\t" - "eor r10, r10, r7\n\t" - "and r10, r10, r11\n\t" - "eor r7, r7, r10\n\t" - "strd r3, r12, [r0, #24]\n\t" - "strd r4, r5, [r0, #56]\n\t" - "strd r6, r7, [r0, #88]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) - : - : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_mul(fe r, const fe a, const fe b) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Multiply */ - "ldr r8, [r1]\n\t" - "ldr r9, [r1, #4]\n\t" - "ldr r10, [r2]\n\t" - "ldr r4, [r2, #4]\n\t" - /* A[0] * B[0] = 0 */ - "umull r5, r6, r8, r10\n\t" - "str r5, [sp]\n\t" - /* A[0] * B[1] = 1 */ - "umull r3, r7, r8, r4\n\t" - "adds r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * B[0] = 1 */ - "umull r3, r12, r9, r10\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #4]\n\t" - /* A[2] * B[0] = 2 */ - "ldr r11, [r1, #8]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r7, r7, r3\n\t" - "adc r5, r5, r12\n\t" - /* A[1] * B[1] = 2 */ - "umull r3, r12, r9, r4\n\t" - "adds r7, r7, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[0] * B[2] = 2 */ - "ldr lr, [r2, #8]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #8]\n\t" - /* A[0] * B[3] = 3 */ - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r5, r5, r3\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * B[2] = 3 */ - "ldr lr, [r2, #8]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[2] * B[1] = 3 */ - "umull r3, r12, r11, r4\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[3] * B[0] = 3 */ - "ldr r11, [r1, #12]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #12]\n\t" - /* A[4] * B[0] = 4 */ - "ldr r11, [r1, #16]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[1] = 4 */ - "ldr r11, [r1, #12]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[2] = 4 */ - "ldr r11, [r1, #8]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[3] = 4 */ - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[0] * B[4] = 4 */ - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #16]\n\t" - /* A[0] * B[5] = 5 */ - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r7, r7, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * B[4] = 5 */ - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * B[3] = 5 */ - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * B[2] = 5 */ - "ldr r11, [r1, #12]\n\t" - "ldr lr, [r2, #8]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r7, 
r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * B[1] = 5 */ - "ldr r11, [r1, #16]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[0] = 5 */ - "ldr r11, [r1, #20]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #20]\n\t" - /* A[6] * B[0] = 6 */ - "ldr r11, [r1, #24]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r5, r5, r3\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[5] * B[1] = 6 */ - "ldr r11, [r1, #20]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[4] * B[2] = 6 */ - "ldr r11, [r1, #16]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[3] * B[3] = 6 */ - "ldr r11, [r1, #12]\n\t" - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[2] * B[4] = 6 */ - "ldr r11, [r1, #8]\n\t" - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * B[5] = 6 */ - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[0] * B[6] = 6 */ - "ldr lr, [r2, #24]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #24]\n\t" - /* A[0] * B[7] = 7 */ - "ldr lr, [r2, #28]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[6] = 7 */ - "ldr lr, [r2, #24]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[5] = 7 */ - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[4] = 7 */ - "ldr r11, [r1, #12]\n\t" - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[3] = 7 */ - "ldr r11, [r1, #16]\n\t" - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[2] = 7 */ - "ldr r11, [r1, #20]\n\t" - "ldr lr, [r2, #8]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[1] = 7 */ - "ldr r11, [r1, #24]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[0] = 7 */ - "ldr r11, [r1, #28]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #28]\n\t" - "ldr r8, [r1, #24]\n\t" - "ldr r10, [r2, #24]\n\t" - /* A[7] * B[1] = 8 */ - "umull r3, r12, r11, r4\n\t" - "adds r7, r7, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[6] * B[2] = 8 */ - "umull r3, r12, r8, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[3] = 8 */ - "ldr r11, [r1, #20]\n\t" - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * B[4] = 8 */ - "ldr r11, [r1, #16]\n\t" - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r7, r7, r3\n\t" - 
"adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * B[5] = 8 */ - "ldr r11, [r1, #12]\n\t" - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * B[6] = 8 */ - "ldr r11, [r1, #8]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * B[7] = 8 */ - "ldr lr, [r2, #28]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #32]\n\t" - "ldr r9, [r1, #28]\n\t" - "mov r4, lr\n\t" - /* A[2] * B[7] = 9 */ - "umull r3, r12, r11, r4\n\t" - "adds r5, r5, r3\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[3] * B[6] = 9 */ - "ldr r11, [r1, #12]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[4] * B[5] = 9 */ - "ldr r11, [r1, #16]\n\t" - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[5] * B[4] = 9 */ - "ldr r11, [r1, #20]\n\t" - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[6] * B[3] = 9 */ - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[7] * B[2] = 9 */ - "ldr lr, [r2, #8]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #36]\n\t" - /* A[7] * B[3] = 10 */ - "ldr lr, [r2, #12]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[4] = 10 */ - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r8, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[5] = 10 */ - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r11, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[6] = 10 */ - "ldr r11, [r1, #16]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[7] = 10 */ - "ldr r11, [r1, #12]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #40]\n\t" - /* A[4] * B[7] = 11 */ - "ldr r11, [r1, #16]\n\t" - "umull r3, r12, r11, r4\n\t" - "adds r7, r7, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[6] = 11 */ - "ldr r11, [r1, #20]\n\t" - "umull r3, r12, r11, r10\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[6] * B[5] = 11 */ - "umull r3, r12, r8, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[7] * B[4] = 11 */ - "ldr lr, [r2, #16]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r7, r7, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #44]\n\t" - /* A[7] * B[5] = 12 */ - "ldr lr, [r2, #20]\n\t" - "umull r3, r12, r9, lr\n\t" - "adds r5, r5, r3\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[6] * B[6] = 12 */ - "umull r3, r12, r8, r10\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - /* A[5] * B[7] = 12 */ - "umull r3, r12, r11, r4\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #48]\n\t" - /* A[6] * B[7] = 13 */ - "umull r3, r12, r8, r4\n\t" 
- "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[6] = 13 */ - "umull r3, r12, r9, r10\n\t" - "adds r6, r6, r3\n\t" - "adcs r7, r7, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #52]\n\t" - /* A[7] * B[7] = 14 */ - "umull r3, r12, r9, r4\n\t" - "adds r7, r7, r3\n\t" - "adc r5, r5, r12\n\t" - "str r7, [sp, #56]\n\t" - "str r5, [sp, #60]\n\t" - /* Reduce */ - /* Load bottom half */ - "ldrd r5, r6, [sp]\n\t" - "ldrd r7, r8, [sp, #8]\n\t" - "ldrd r9, r10, [sp, #16]\n\t" - "ldrd r11, lr, [sp, #24]\n\t" - "lsr r3, lr, #31\n\t" - "and lr, lr, #0x7fffffff\n\t" - "mov r4, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "adds r5, r5, r3\n\t" - "mov %[b], #0\n\t" - "adcs r6, r6, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r6, r6, r3\n\t" - "mov %[b], #0\n\t" - "adcs r7, r7, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r7, r7, r3\n\t" - "mov %[b], #0\n\t" - "adcs r8, r8, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r8, r8, r3\n\t" - "mov %[b], #0\n\t" - "adcs r9, r9, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #48]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r9, r9, r3\n\t" - "mov %[b], #0\n\t" - "adcs r10, r10, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r10, r10, r3\n\t" - "mov %[b], #0\n\t" - "adcs r11, r11, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #56]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r11, r11, r3\n\t" - "mov %[b], #0\n\t" - "adcs lr, lr, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, r4, r3\n\t" - "adds lr, lr, r3\n\t" - "adc r3, r12, %[b]\n\t" - /* Overflow */ - "lsl r3, r3, #1\n\t" - "orr r3, r3, lr, lsr #31\n\t" - "mul r3, r3, r4\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Reduce if top bit set */ - "asr r3, lr, #31\n\t" - "and r3, r3, r4\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Store */ - "strd r5, r6, [r0]\n\t" - "strd r7, r8, [r0, #8]\n\t" - "strd r9, r10, [r0, #16]\n\t" - "strd r11, lr, [r0, #24]\n\t" - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_sq(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Square */ - "ldr r8, [r1]\n\t" - "ldr r9, [r1, #4]\n\t" - "ldr r10, [r1, #8]\n\t" - "ldr r11, 
[r1, #12]\n\t" - "ldr r12, [r1, #16]\n\t" - /* A[0] * A[0] = 0 */ - "umull r5, r6, r8, r8\n\t" - "str r5, [sp]\n\t" - /* A[0] * A[1] = 1 */ - "umull r2, r3, r8, r9\n\t" - "mov r7, #0\n\t" - "adds r6, r6, r2\n\t" - "adc r7, r7, r3\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #4]\n\t" - /* A[1] * A[1] = 2 */ - "umull r2, r3, r9, r9\n\t" - "adds r7, r7, r2\n\t" - "adc r5, r5, r3\n\t" - /* A[0] * A[2] = 2 */ - "umull r2, r3, r8, r10\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #8]\n\t" - /* A[0] * A[3] = 3 */ - "umull r2, r3, r8, r11\n\t" - "adds r5, r5, r2\n\t" - "adc r6, r6, r3\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * A[2] = 3 */ - "umull r2, r3, r9, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #12]\n\t" - /* A[2] * A[2] = 4 */ - "umull r2, r3, r10, r10\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[3] = 4 */ - "umull r2, r3, r9, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[0] * A[4] = 4 */ - "umull r2, r3, r8, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #16]\n\t" - /* A[0] * A[5] = 5 */ - "ldr lr, [r1, #20]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[4] = 5 */ - "umull r2, r3, r9, r12\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[3] = 5 */ - "umull r2, r3, r10, r11\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #20]\n\t" - /* A[3] * A[3] = 6 */ - "umull r2, r3, r11, r11\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[2] * A[4] = 6 */ - "umull r2, r3, r10, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * A[5] = 6 */ - "umull r2, r3, r9, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[0] * A[6] = 6 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #24]\n\t" - /* A[0] * A[7] = 7 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[6] = 7 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r9, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc 
r5, r5, #0\n\t" - /* A[2] * A[5] = 7 */ - "ldr lr, [r1, #20]\n\t" - "umull r2, r3, r10, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * A[4] = 7 */ - "umull r2, r3, r11, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #28]\n\t" - /* A[4] * A[4] = 8 */ - "umull r2, r3, r12, r12\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * A[5] = 8 */ - "umull r2, r3, r11, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[6] = 8 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r10, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[7] = 8 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r9, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #32]\n\t" - "ldr r8, [r1, #20]\n\t" - /* A[2] * A[7] = 9 */ - "umull r2, r3, r10, lr\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[3] * A[6] = 9 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r11, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[4] * A[5] = 9 */ - "umull r2, r3, r12, r8\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #36]\n\t" - "mov r9, lr\n\t" - /* A[5] * A[5] = 10 */ - "umull r2, r3, r8, r8\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * A[6] = 10 */ - "umull r2, r3, r12, r9\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * A[7] = 10 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r11, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #40]\n\t" - "mov r10, lr\n\t" - /* A[4] * A[7] = 11 */ - "umull r2, r3, r12, r10\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * A[6] = 11 */ - "umull r2, r3, r8, r9\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #44]\n\t" - /* A[6] * A[6] = 12 */ - "umull r2, r3, r9, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[5] * A[7] = 12 */ - "umull r2, r3, r8, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #48]\n\t" - /* A[6] * A[7] = 13 */ - "umull r2, r3, r9, r10\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, 
r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #52]\n\t" - /* A[7] * A[7] = 14 */ - "umull r2, r3, r10, r10\n\t" - "adds r7, r7, r2\n\t" - "adc r5, r5, r3\n\t" - "str r7, [sp, #56]\n\t" - "str r5, [sp, #60]\n\t" - /* Reduce */ - /* Load bottom half */ - "ldrd r5, r6, [sp]\n\t" - "ldrd r7, r8, [sp, #8]\n\t" - "ldrd r9, r10, [sp, #16]\n\t" - "ldrd r11, lr, [sp, #24]\n\t" - "lsr r2, lr, #31\n\t" - "and lr, lr, #0x7fffffff\n\t" - "mov r12, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r6, r6, r2\n\t" - "mov r4, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r7, r7, r2\n\t" - "mov r4, #0\n\t" - "adcs r8, r8, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r8, r8, r2\n\t" - "mov r4, #0\n\t" - "adcs r9, r9, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #48]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r9, r9, r2\n\t" - "mov r4, #0\n\t" - "adcs r10, r10, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r10, r10, r2\n\t" - "mov r4, #0\n\t" - "adcs r11, r11, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #56]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r11, r11, r2\n\t" - "mov r4, #0\n\t" - "adcs lr, lr, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "adds lr, lr, r2\n\t" - "adc r2, r3, r4\n\t" - /* Overflow */ - "lsl r2, r2, #1\n\t" - "orr r2, r2, lr, lsr #31\n\t" - "mul r2, r2, r12\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Reduce if top bit set */ - "asr r2, lr, #31\n\t" - "and r2, r2, r12\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Store */ - "strd r5, r6, [r0]\n\t" - "strd r7, r8, [r0, #8]\n\t" - "strd r9, r10, [r0, #16]\n\t" - "strd r11, lr, [r0, #24]\n\t" - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_mul121666(fe r, fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0\n\t" - /* Multiply by 121666 */ - "ldrd r2, r3, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "movw r4, #0xdb42\n\t" - "movt r4, #1\n\t" - "umull r2, r11, r2, r4\n\t" - "umull r3, r12, r3, r4\n\t" - "adds r3, r3, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r5, r12, r5, r4\n\t" - "adds r5, 
r5, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r6, r12, r6, r4\n\t" - "adds r6, r6, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r7, r12, r7, r4\n\t" - "adds r7, r7, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r8, r12, r8, r4\n\t" - "adds r8, r8, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r9, r12, r9, r4\n\t" - "adds r9, r9, r11\n\t" - "adc r11, r12, #0\n\t" - "umull r10, r12, r10, r4\n\t" - "adds r10, r10, r11\n\t" - "adc r11, r12, #0\n\t" - "mov r4, #19\n\t" - "lsl r11, r11, #1\n\t" - "orr r11, r11, r10, lsr #31\n\t" - "mul r11, r11, r4\n\t" - "and r10, r10, #0x7fffffff\n\t" - "adds r2, r2, r11\n\t" - "adcs r3, r3, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adc r10, r10, #0\n\t" - "strd r2, r3, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" - "add sp, sp, #0\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); -} - -void fe_sq2(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Square * 2 */ - "ldr r8, [r1]\n\t" - "ldr r9, [r1, #4]\n\t" - "ldr r10, [r1, #8]\n\t" - "ldr r11, [r1, #12]\n\t" - "ldr r12, [r1, #16]\n\t" - /* A[0] * A[0] = 0 */ - "umull r5, r6, r8, r8\n\t" - "str r5, [sp]\n\t" - /* A[0] * A[1] = 1 */ - "umull r2, r3, r8, r9\n\t" - "mov r7, #0\n\t" - "adds r6, r6, r2\n\t" - "adc r7, r7, r3\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #4]\n\t" - /* A[1] * A[1] = 2 */ - "umull r2, r3, r9, r9\n\t" - "adds r7, r7, r2\n\t" - "adc r5, r5, r3\n\t" - /* A[0] * A[2] = 2 */ - "umull r2, r3, r8, r10\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #8]\n\t" - /* A[0] * A[3] = 3 */ - "umull r2, r3, r8, r11\n\t" - "adds r5, r5, r2\n\t" - "adc r6, r6, r3\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * A[2] = 3 */ - "umull r2, r3, r9, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #12]\n\t" - /* A[2] * A[2] = 4 */ - "umull r2, r3, r10, r10\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[3] = 4 */ - "umull r2, r3, r9, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[0] * A[4] = 4 */ - "umull r2, r3, r8, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #16]\n\t" - /* A[0] * A[5] = 5 */ - "ldr lr, [r1, #20]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[4] = 5 */ - "umull r2, r3, r9, r12\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[3] = 5 */ - "umull r2, r3, r10, r11\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, 
#20]\n\t" - /* A[3] * A[3] = 6 */ - "umull r2, r3, r11, r11\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[2] * A[4] = 6 */ - "umull r2, r3, r10, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[1] * A[5] = 6 */ - "umull r2, r3, r9, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[0] * A[6] = 6 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #24]\n\t" - /* A[0] * A[7] = 7 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r8, lr\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[6] = 7 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r9, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * A[5] = 7 */ - "ldr lr, [r1, #20]\n\t" - "umull r2, r3, r10, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * A[4] = 7 */ - "umull r2, r3, r11, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #28]\n\t" - /* A[4] * A[4] = 8 */ - "umull r2, r3, r12, r12\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * A[5] = 8 */ - "umull r2, r3, r11, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[6] = 8 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r10, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[7] = 8 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r9, lr\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #32]\n\t" - "ldr r8, [r1, #20]\n\t" - /* A[2] * A[7] = 9 */ - "umull r2, r3, r10, lr\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[3] * A[6] = 9 */ - "ldr lr, [r1, #24]\n\t" - "umull r2, r3, r11, lr\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[4] * A[5] = 9 */ - "umull r2, r3, r12, r8\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #36]\n\t" - "mov r9, lr\n\t" - /* A[5] * A[5] = 10 */ - "umull r2, r3, r8, r8\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * A[6] = 10 */ - "umull r2, r3, r12, r9\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" 
- /* A[3] * A[7] = 10 */ - "ldr lr, [r1, #28]\n\t" - "umull r2, r3, r11, lr\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #40]\n\t" - "mov r10, lr\n\t" - /* A[4] * A[7] = 11 */ - "umull r2, r3, r12, r10\n\t" - "adds r7, r7, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * A[6] = 11 */ - "umull r2, r3, r8, r9\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r7, r7, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r7, [sp, #44]\n\t" - /* A[6] * A[6] = 12 */ - "umull r2, r3, r9, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r7, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - /* A[5] * A[7] = 12 */ - "umull r2, r3, r8, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r7, r7, #0\n\t" - "str r5, [sp, #48]\n\t" - /* A[6] * A[7] = 13 */ - "umull r2, r3, r9, r10\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r7, r7, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #52]\n\t" - /* A[7] * A[7] = 14 */ - "umull r2, r3, r10, r10\n\t" - "adds r7, r7, r2\n\t" - "adc r5, r5, r3\n\t" - "str r7, [sp, #56]\n\t" - "str r5, [sp, #60]\n\t" - /* Double and Reduce */ - /* Load bottom half */ - "ldrd r5, r6, [sp]\n\t" - "ldrd r7, r8, [sp, #8]\n\t" - "ldrd r9, r10, [sp, #16]\n\t" - "ldrd r11, lr, [sp, #24]\n\t" - "lsr r2, lr, #30\n\t" - "lsl lr, lr, #1\n\t" - "orr lr, lr, r11, lsr #31\n\t" - "lsl r11, r11, #1\n\t" - "orr r11, r11, r10, lsr #31\n\t" - "lsl r10, r10, #1\n\t" - "orr r10, r10, r9, lsr #31\n\t" - "lsl r9, r9, #1\n\t" - "orr r9, r9, r8, lsr #31\n\t" - "lsl r8, r8, #1\n\t" - "orr r8, r8, r7, lsr #31\n\t" - "lsl r7, r7, #1\n\t" - "orr r7, r7, r6, lsr #31\n\t" - "lsl r6, r6, #1\n\t" - "orr r6, r6, r5, lsr #31\n\t" - "lsl r5, r5, #1\n\t" - "and lr, lr, #0x7fffffff\n\t" - "mov r12, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r6, r6, r2\n\t" - "mov r4, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r7, r7, r2\n\t" - "mov r4, #0\n\t" - "adcs r8, r8, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r8, r8, r2\n\t" - "mov r4, #0\n\t" - "adcs r9, r9, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #48]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r9, r9, r2\n\t" - "mov r4, #0\n\t" - "adcs r10, r10, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r10, r10, r2\n\t" - "mov r4, #0\n\t" - "adcs r11, r11, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #56]\n\t" - "orr r2, r2, %[a], lsl 
#2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, r4\n\t" - "adds r11, r11, r2\n\t" - "mov r4, #0\n\t" - "adcs lr, lr, r3\n\t" - "adc r4, r4, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "adds lr, lr, r2\n\t" - "adc r2, r3, r4\n\t" - /* Overflow */ - "lsl r2, r2, #1\n\t" - "orr r2, r2, lr, lsr #31\n\t" - "mul r2, r2, r12\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Reduce if top bit set */ - "asr r2, lr, #31\n\t" - "and r2, r2, r12\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - /* Store */ - "strd r5, r6, [r0]\n\t" - "strd r7, r8, [r0, #8]\n\t" - "strd r9, r10, [r0, #16]\n\t" - "strd r11, lr, [r0, #24]\n\t" - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); -} - -void fe_invert(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x90\n\t" - /* Invert */ - "str %[r], [sp, #128]\n\t" - "str %[a], [sp, #132]\n\t" - "mov r0, sp\n\t" - "ldr r1, [sp, #132]\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "ldr r1, [sp, #132]\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_fe_invert1:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert1\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_fe_invert2:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert2\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_fe_invert3:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert3\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_fe_invert4:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert4\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #0x31\n\t" - "\n" - "L_fe_invert5:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert5\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - 
"add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_fe_invert6:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert6\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "mov r4, #0x32\n\t" - "\n" - "L_fe_invert7:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert7\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #5\n\t" - "\n" - "L_fe_invert8:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert8\n\t" - "ldr r0, [sp, #128]\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #132]\n\t" - "ldr %[r], [sp, #128]\n\t" - "add sp, sp, #0x90\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "lr", "r4" - ); -} - -int curve25519(byte* r, byte* n, byte* a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0xc0\n\t" - "str %[r], [sp, #160]\n\t" - "str %[n], [sp, #164]\n\t" - "str %[a], [sp, #168]\n\t" - "mov %[n], #0\n\t" - "str %[n], [sp, #172]\n\t" - /* Set one */ - "mov lr, #1\n\t" - "mov r11, #0\n\t" - "strd lr, r11, [r0]\n\t" - "strd r11, r11, [r0, #8]\n\t" - "strd r11, r11, [r0, #16]\n\t" - "strd r11, r11, [r0, #24]\n\t" - /* Set zero */ - "mov r11, #0\n\t" - "strd r11, r11, [sp]\n\t" - "strd r11, r11, [sp, #8]\n\t" - "strd r11, r11, [sp, #16]\n\t" - "strd r11, r11, [sp, #24]\n\t" - /* Set one */ - "mov lr, #1\n\t" - "mov r11, #0\n\t" - "strd lr, r11, [sp, #32]\n\t" - "strd r11, r11, [sp, #40]\n\t" - "strd r11, r11, [sp, #48]\n\t" - "strd r11, r11, [sp, #56]\n\t" - /* Copy */ - "ldrd r5, r6, [r2]\n\t" - "ldrd r7, r8, [r2, #8]\n\t" - "strd r5, r6, [sp, #64]\n\t" - "strd r7, r8, [sp, #72]\n\t" - "ldrd r5, r6, [r2, #16]\n\t" - "ldrd r7, r8, [r2, #24]\n\t" - "strd r5, r6, [sp, #80]\n\t" - "strd r7, r8, [sp, #88]\n\t" - "mov %[n], #30\n\t" - "str %[n], [sp, #180]\n\t" - "mov %[a], #28\n\t" - "str %[a], [sp, #176]\n\t" - "\n" - "L_curve25519_words:\n\t" - "\n" - "L_curve25519_bits:\n\t" - "ldr %[n], [sp, #164]\n\t" - "ldr %[a], [r1, r2]\n\t" - "ldr %[n], [sp, #180]\n\t" - "lsr %[a], %[a], %[n]\n\t" - "and %[a], %[a], #1\n\t" - "str %[a], [sp, #184]\n\t" - "ldr %[n], [sp, #172]\n\t" - "eor %[n], %[n], %[a]\n\t" - "str %[n], [sp, #172]\n\t" - "ldr %[r], [sp, #160]\n\t" - /* Conditional Swap */ - "neg %[n], %[n]\n\t" - "ldrd r5, r6, [r0]\n\t" - "ldrd r7, r8, [sp, #64]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [r0]\n\t" - "strd r7, r8, [sp, #64]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "ldrd r7, r8, [sp, #72]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [sp, #72]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" - "ldrd r7, r8, [sp, #80]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [r0, #16]\n\t" - "strd r7, r8, [sp, #80]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" - "ldrd r7, r8, [sp, #88]\n\t" - "eor r9, r5, r7\n\t" 
- "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [r0, #24]\n\t" - "strd r7, r8, [sp, #88]\n\t" - "ldr %[n], [sp, #172]\n\t" - /* Conditional Swap */ - "neg %[n], %[n]\n\t" - "ldrd r5, r6, [sp]\n\t" - "ldrd r7, r8, [sp, #32]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [sp]\n\t" - "strd r7, r8, [sp, #32]\n\t" - "ldrd r5, r6, [sp, #8]\n\t" - "ldrd r7, r8, [sp, #40]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [sp, #8]\n\t" - "strd r7, r8, [sp, #40]\n\t" - "ldrd r5, r6, [sp, #16]\n\t" - "ldrd r7, r8, [sp, #48]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [sp, #16]\n\t" - "strd r7, r8, [sp, #48]\n\t" - "ldrd r5, r6, [sp, #24]\n\t" - "ldrd r7, r8, [sp, #56]\n\t" - "eor r9, r5, r7\n\t" - "eor r10, r6, r8\n\t" - "and r9, r9, %[n]\n\t" - "and r10, r10, %[n]\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r10\n\t" - "eor r7, r7, r9\n\t" - "eor r8, r8, r10\n\t" - "strd r5, r6, [sp, #24]\n\t" - "strd r7, r8, [sp, #56]\n\t" - "ldr %[n], [sp, #184]\n\t" - "str %[n], [sp, #172]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd r5, r6, [r0]\n\t" - "ldrd r7, r8, [sp]\n\t" - "adds r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [r0]\n\t" - /* Sub */ - "subs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #128]\n\t" - /* Add */ - "ldrd r5, r6, [r0, #8]\n\t" - "ldrd r7, r8, [sp, #8]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [r0, #8]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #136]\n\t" - /* Add */ - "ldrd r5, r6, [r0, #16]\n\t" - "ldrd r7, r8, [sp, #16]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [r0, #16]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #144]\n\t" - /* Add */ - "ldrd r5, r6, [r0, #24]\n\t" - "ldrd r7, r8, [sp, #24]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "adc r10, r6, r8\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "sbc lr, r6, r8\n\t" - "mov r3, #-19\n\t" - "asr %[a], r10, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd r5, r6, [r0]\n\t" - "subs r5, r5, r3\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [r0, #16]\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbc r10, r10, r12\n\t" - "strd r9, r10, [r0, #24]\n\t" - "mov 
r3, #-19\n\t" - "asr %[a], lr, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd r5, r6, [sp, #128]\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #128]\n\t" - "ldrd r5, r6, [sp, #136]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #136]\n\t" - "ldrd r5, r6, [sp, #144]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #144]\n\t" - "adcs r11, r11, %[a]\n\t" - "adc lr, lr, r12\n\t" - "strd r11, lr, [sp, #152]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd r5, r6, [sp, #64]\n\t" - "ldrd r7, r8, [sp, #32]\n\t" - "adds r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp]\n\t" - /* Sub */ - "subs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #96]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #72]\n\t" - "ldrd r7, r8, [sp, #40]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp, #8]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #104]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #80]\n\t" - "ldrd r7, r8, [sp, #48]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp, #16]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #112]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #88]\n\t" - "ldrd r7, r8, [sp, #56]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "adc r10, r6, r8\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "sbc lr, r6, r8\n\t" - "mov r3, #-19\n\t" - "asr %[a], r10, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd r5, r6, [sp]\n\t" - "subs r5, r5, r3\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp]\n\t" - "ldrd r5, r6, [sp, #8]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #8]\n\t" - "ldrd r5, r6, [sp, #16]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #16]\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbc r10, r10, r12\n\t" - "strd r9, r10, [sp, #24]\n\t" - "mov r3, #-19\n\t" - "asr %[a], lr, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd r5, r6, [sp, #96]\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #96]\n\t" - "ldrd r5, r6, [sp, #104]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #104]\n\t" - "ldrd r5, r6, [sp, #112]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #112]\n\t" - "adcs r11, r11, %[a]\n\t" - "adc lr, lr, r12\n\t" - "strd r11, lr, [sp, #120]\n\t" - "ldr r2, [sp, #160]\n\t" - "add r1, sp, #0x60\n\t" - "add r0, sp, #0x20\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0x80\n\t" - "add r1, sp, #0\n\t" - "add r0, sp, #0\n\t" - "bl fe_mul\n\t" - "add r1, sp, #0x80\n\t" - "add r0, sp, #0x60\n\t" - "bl fe_sq\n\t" - "ldr r1, [sp, #160]\n\t" - "add r0, sp, #0x80\n\t" - "bl fe_sq\n\t" - /* Add-Sub */ - /* Add */ - "ldrd r5, r6, [sp, #32]\n\t" - "ldrd r7, r8, [sp]\n\t" - "adds r9, r5, 
r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp, #64]\n\t" - /* Sub */ - "subs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #40]\n\t" - "ldrd r7, r8, [sp, #8]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp, #72]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #8]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #48]\n\t" - "ldrd r7, r8, [sp, #16]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "mov r3, #0\n\t" - "adcs r10, r6, r8\n\t" - "adc r3, r3, #0\n\t" - "strd r9, r10, [sp, #80]\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "mov r12, #0\n\t" - "sbcs lr, r6, r8\n\t" - "adc r12, r12, #0\n\t" - "strd r11, lr, [sp, #16]\n\t" - /* Add */ - "ldrd r5, r6, [sp, #56]\n\t" - "ldrd r7, r8, [sp, #24]\n\t" - "adds r3, r3, #-1\n\t" - "adcs r9, r5, r7\n\t" - "adc r10, r6, r8\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r11, r5, r7\n\t" - "sbc lr, r6, r8\n\t" - "mov r3, #-19\n\t" - "asr %[a], r10, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd r5, r6, [sp, #64]\n\t" - "subs r5, r5, r3\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #64]\n\t" - "ldrd r5, r6, [sp, #72]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #72]\n\t" - "ldrd r5, r6, [sp, #80]\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #80]\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbc r10, r10, r12\n\t" - "strd r9, r10, [sp, #88]\n\t" - "mov r3, #-19\n\t" - "asr %[a], lr, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd r5, r6, [sp]\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp]\n\t" - "ldrd r5, r6, [sp, #8]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #8]\n\t" - "ldrd r5, r6, [sp, #16]\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "strd r5, r6, [sp, #16]\n\t" - "adcs r11, r11, %[a]\n\t" - "adc lr, lr, r12\n\t" - "strd r11, lr, [sp, #24]\n\t" - "add r2, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "ldr r0, [sp, #160]\n\t" - "bl fe_mul\n\t" - /* Sub */ - "ldrd r5, r6, [sp, #128]\n\t" - "ldrd r7, r8, [sp, #136]\n\t" - "ldrd r9, r10, [sp, #96]\n\t" - "ldrd r11, lr, [sp, #104]\n\t" - "subs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "sbcs lr, r8, lr\n\t" - "strd r9, r10, [sp, #128]\n\t" - "strd r11, lr, [sp, #136]\n\t" - "ldrd r5, r6, [sp, #144]\n\t" - "ldrd r7, r8, [sp, #152]\n\t" - "ldrd r9, r10, [sp, #112]\n\t" - "ldrd r11, lr, [sp, #120]\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "sbc lr, r8, lr\n\t" - "mov r3, #-19\n\t" - "asr %[a], lr, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd r5, r6, [sp, #128]\n\t" - "ldrd r7, r8, [sp, #136]\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, %[a]\n\t" - "adcs r7, r7, %[a]\n\t" - "adcs r8, r8, %[a]\n\t" - "adcs r9, r9, %[a]\n\t" - "adcs r10, r10, %[a]\n\t" - "adcs r11, r11, %[a]\n\t" - "adc lr, lr, r12\n\t" - "strd r5, r6, [sp, #128]\n\t" - "strd r7, r8, [sp, #136]\n\t" - 
"strd r9, r10, [sp, #144]\n\t" - "strd r11, lr, [sp, #152]\n\t" - "add r1, sp, #0\n\t" - "add r0, sp, #0\n\t" - "bl fe_sq\n\t" - /* Multiply by 121666 */ - "ldrd r5, r6, [sp, #128]\n\t" - "ldrd r7, r8, [sp, #136]\n\t" - "ldrd r9, r10, [sp, #144]\n\t" - "ldrd r11, lr, [sp, #152]\n\t" - "movw r12, #0xdb42\n\t" - "movt r12, #1\n\t" - "umull r5, %[a], r5, r12\n\t" - "umull r6, r3, r6, r12\n\t" - "adds r6, r6, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r7, r3, r7, r12\n\t" - "adds r7, r7, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r8, r3, r8, r12\n\t" - "adds r8, r8, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r9, r3, r9, r12\n\t" - "adds r9, r9, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r10, r3, r10, r12\n\t" - "adds r10, r10, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r11, r3, r11, r12\n\t" - "adds r11, r11, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull lr, r3, lr, r12\n\t" - "adds lr, lr, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "mov r12, #19\n\t" - "lsl %[a], %[a], #1\n\t" - "orr %[a], %[a], lr, lsr #31\n\t" - "mul %[a], %[a], r12\n\t" - "and lr, lr, #0x7fffffff\n\t" - "adds r5, r5, %[a]\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adcs r11, r11, #0\n\t" - "adc lr, lr, #0\n\t" - "strd r5, r6, [sp, #32]\n\t" - "strd r7, r8, [sp, #40]\n\t" - "strd r9, r10, [sp, #48]\n\t" - "strd r11, lr, [sp, #56]\n\t" - "add r1, sp, #0x40\n\t" - "add r0, sp, #0x40\n\t" - "bl fe_sq\n\t" - /* Add */ - "ldrd r5, r6, [sp, #96]\n\t" - "ldrd r7, r8, [sp, #104]\n\t" - "ldrd r9, r10, [sp, #32]\n\t" - "ldrd r11, lr, [sp, #40]\n\t" - "adds r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "adcs lr, r8, lr\n\t" - "strd r9, r10, [sp, #96]\n\t" - "strd r11, lr, [sp, #104]\n\t" - "ldrd r5, r6, [sp, #112]\n\t" - "ldrd r7, r8, [sp, #120]\n\t" - "ldrd r9, r10, [sp, #48]\n\t" - "ldrd r11, lr, [sp, #56]\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "adc lr, r8, lr\n\t" - "mov r3, #-19\n\t" - "asr %[a], lr, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd r5, r6, [sp, #96]\n\t" - "ldrd r7, r8, [sp, #104]\n\t" - "subs r5, r5, r3\n\t" - "sbcs r6, r6, %[a]\n\t" - "sbcs r7, r7, %[a]\n\t" - "sbcs r8, r8, %[a]\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbcs r10, r10, %[a]\n\t" - "sbcs r11, r11, %[a]\n\t" - "sbc lr, lr, r12\n\t" - "strd r5, r6, [sp, #96]\n\t" - "strd r7, r8, [sp, #104]\n\t" - "strd r9, r10, [sp, #112]\n\t" - "strd r11, lr, [sp, #120]\n\t" - "add r2, sp, #0\n\t" - "ldr r1, [sp, #168]\n\t" - "add r0, sp, #0x20\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "add r0, sp, #0\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #176]\n\t" - "ldr %[n], [sp, #180]\n\t" - "subs %[n], %[n], #1\n\t" - "str %[n], [sp, #180]\n\t" - "bge L_curve25519_bits\n\t" - "mov %[n], #31\n\t" - "str %[n], [sp, #180]\n\t" - "subs %[a], %[a], #4\n\t" - "str %[a], [sp, #176]\n\t" - "bge L_curve25519_words\n\t" - /* Invert */ - "add r0, sp, #32\n\t" - "add r1, sp, #0\n\t" - "bl fe_sq\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #0\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "add r2, 
sp, #96\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_curve25519_inv_1:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_1\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_curve25519_inv_2:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_2\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #128\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_curve25519_inv_3:\n\t" - "add r0, sp, #128\n\t" - "add r1, sp, #128\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_3\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #128\n\t" - "add r2, sp, #96\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_curve25519_inv_4:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_4\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "mov r4, #0x31\n\t" - "\n" - "L_curve25519_inv_5:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_5\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "add r0, sp, #128\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_curve25519_inv_6:\n\t" - "add r0, sp, #128\n\t" - "add r1, sp, #128\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_6\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #128\n\t" - "add r2, sp, #96\n\t" - "bl fe_mul\n\t" - "mov r4, #0x32\n\t" - "\n" - "L_curve25519_inv_7:\n\t" - "add r0, sp, #96\n\t" - "add r1, sp, #96\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_7\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #96\n\t" - "add r2, sp, #64\n\t" - "bl fe_mul\n\t" - "mov r4, #5\n\t" - "\n" - "L_curve25519_inv_8:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_8\n\t" - "add r0, sp, #0\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0\n\t" - "ldr r1, [sp, #160]\n\t" - "ldr r0, [sp, #160]\n\t" - "bl fe_mul\n\t" - "mov r0, #0\n\t" - "add sp, sp, #0xc0\n\t" - : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) - : - : "memory", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - return (uint32_t)(size_t)r; -} - -void fe_pow22523(fe r, const fe a) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x70\n\t" - /* pow22523 */ - "str %[r], [sp, #96]\n\t" - "str %[a], [sp, #100]\n\t" - "mov r0, sp\n\t" - "ldr r1, [sp, #100]\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "ldr r1, [sp, #100]\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r0, sp\n\t" - "add r1, sp, 
#32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_fe_pow22523_1:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_1\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_fe_pow22523_2:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_2\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_fe_pow22523_3:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_3\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_fe_pow22523_4:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_4\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #0x31\n\t" - "\n" - "L_fe_pow22523_5:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_5\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_fe_pow22523_6:\n\t" - "add r0, sp, #64\n\t" - "add r1, sp, #64\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_6\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #64\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #0x32\n\t" - "\n" - "L_fe_pow22523_7:\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_7\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "mov r4, #2\n\t" - "\n" - "L_fe_pow22523_8:\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_8\n\t" - "ldr r0, [sp, #96]\n\t" - "mov r1, sp\n\t" - "ldr r2, [sp, #100]\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #100]\n\t" - "ldr %[r], [sp, #96]\n\t" - "add sp, sp, #0x70\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "lr", "r4" - ); -} - -void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) -{ - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[px], [sp, #12]\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #24]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px) - : - : "memory", "lr" - ); - (void)py; - (void)pz; - (void)pt; -} - -void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) -{ - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str 
%[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #28]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #24]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "lr" - ); - (void)px; - (void)py; - (void)pz; - (void)pt; -} - -void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) -{ - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r1, [sp, #52]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_sq\n\t" - "ldr r1, [sp, #56]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_sq\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #52]\n\t" - "ldr r2, [sp, #56]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "adds r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "adcs r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_sq\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #8]\n\t" - "ldr r2, [sp]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r2]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #8]\n\t" - "ldrd r6, r7, [r2, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r2, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #24]\n\t" - "ldrd r6, r7, [r2, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc 
r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #12]\n\t" - "ldr r2, [sp, #4]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r1, [sp, #60]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_sq2\n\t" - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "ldrd r8, r9, [r1]\n\t" - "ldrd r10, r11, [r1, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "ldrd r8, r9, [r1, #16]\n\t" - "ldrd r10, r11, [r1, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, 
r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - (void)px; - (void)py; - (void)pz; -} - -void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x20\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "adds r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "adcs r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r2, [sp, #88]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #92]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #80]\n\t" - "ldr r1, [sp, #84]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, 
#0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #76]\n\t" - /* Double */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r1, #16]\n\t" - "ldrd r10, r11, [r1, #24]\n\t" - "adds %[rt], %[rt], %[rt]\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #12]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r1]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - 
"ldrd %[rt], r5, [r0, #8]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r0, #16]\n\t" - "ldrd r6, r7, [r1, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r0, #24]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "add sp, sp, #0x20\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - (void)px; - (void)py; - (void)pz; - (void)pt; - (void)qxy2d; - (void)qyplusx; - (void)qyminusx; -} - -void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x20\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "adds r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "adcs r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if 
overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r2, [sp, #92]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #88]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #80]\n\t" - "ldr r1, [sp, #84]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], 
r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #76]\n\t" - /* Double */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r1, #16]\n\t" - "ldrd r10, r11, [r1, #24]\n\t" - "adds %[rt], %[rt], %[rt]\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - 
"sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "add sp, sp, #0x20\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - (void)px; - (void)py; - (void)pz; - (void)pt; - (void)qxy2d; - (void)qyplusx; - (void)qyminusx; -} - -void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x60\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "adds r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "adcs r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, 
lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r2, [sp, #156]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #160]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #144]\n\t" - "ldr r1, [sp, #152]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #148]\n\t" - "ldr r1, [sp, #140]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "add r0, sp, #16\n\t" - "ldr r1, [sp]\n\t" - /* Double */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r1, #16]\n\t" - "ldrd r10, r11, [r1, #24]\n\t" - "adds %[rt], %[rt], %[rt]\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask 
the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #12]\n\t" - "add r2, sp, #16\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r1]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r1, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "add sp, sp, #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - (void)px; - (void)py; - (void)pz; - (void)pt; - (void)qz; - (void)qt2d; - (void)qyplusx; - (void)qyminusx; -} - -void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, 
const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "sub sp, sp, #0x60\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Add */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "adds r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "adcs r8, %[rt], r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - "sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Sub */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r2]\n\t" - "ldrd r10, r11, [r2, #8]\n\t" - "subs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" - "strd r8, r9, [r0]\n\t" - "strd r10, r11, [r0, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "ldrd r6, r7, [r1, #24]\n\t" - "ldrd r8, r9, [r2, #16]\n\t" - "ldrd r10, r11, [r2, #24]\n\t" - "sbcs r8, %[rt], r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "adcs r6, r6, lr\n\t" - "adcs r7, r7, lr\n\t" - "adcs r8, r8, lr\n\t" - "adcs r9, r9, lr\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r2, [sp, #160]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #156]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #144]\n\t" - "ldr r1, [sp, #152]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #148]\n\t" - "ldr r1, [sp, #140]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "add r0, sp, #16\n\t" - "ldr r1, [sp]\n\t" - /* Double */ - "ldrd %[rt], r5, [r1]\n\t" - "ldrd r6, r7, [r1, #8]\n\t" - "ldrd r8, r9, [r1, #16]\n\t" - "ldrd r10, r11, [r1, #24]\n\t" - "adds %[rt], %[rt], %[rt]\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "sbcs r6, r6, lr\n\t" - 
"sbcs r7, r7, lr\n\t" - "sbcs r8, r8, lr\n\t" - "sbcs r9, r9, lr\n\t" - "sbcs r10, r10, lr\n\t" - "sbc r11, r11, r4\n\t" - "strd %[rt], r5, [r0]\n\t" - "strd r6, r7, [r0, #8]\n\t" - "strd r8, r9, [r0, #16]\n\t" - "strd r10, r11, [r0, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - "add r2, sp, #16\n\t" - /* Add-Sub */ - /* Add */ - "ldrd %[rt], r5, [r2]\n\t" - "ldrd r6, r7, [r0]\n\t" - "adds r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0]\n\t" - /* Sub */ - "subs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #8]\n\t" - "ldrd r6, r7, [r0, #8]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #8]\n\t" - /* Sub */ - "adds r4, r4, 
#-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #8]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #16]\n\t" - "ldrd r6, r7, [r0, #16]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" - "strd r8, r9, [r0, #16]\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "mov r4, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r4, r4, #0\n\t" - "strd r10, r11, [r1, #16]\n\t" - /* Add */ - "ldrd %[rt], r5, [r2, #24]\n\t" - "ldrd r6, r7, [r0, #24]\n\t" - "adds r12, r12, #-1\n\t" - "adcs r8, %[rt], r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r4, r4, #-1\n\t" - "sbcs r10, %[rt], r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr lr, r9, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "ldrd %[rt], r5, [r0]\n\t" - "subs %[rt], %[rt], r12\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0]\n\t" - "ldrd %[rt], r5, [r0, #8]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #8]\n\t" - "ldrd %[rt], r5, [r0, #16]\n\t" - "sbcs %[rt], %[rt], lr\n\t" - "sbcs r5, r5, lr\n\t" - "strd %[rt], r5, [r0, #16]\n\t" - "sbcs r8, r8, lr\n\t" - "sbc r9, r9, r4\n\t" - "strd r8, r9, [r0, #24]\n\t" - "mov r12, #-19\n\t" - "asr lr, r11, #31\n\t" - /* Mask the modulus */ - "and r12, lr, r12\n\t" - "and r4, lr, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ - "ldrd %[rt], r5, [r1]\n\t" - "adds %[rt], %[rt], r12\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1]\n\t" - "ldrd %[rt], r5, [r1, #8]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #8]\n\t" - "ldrd %[rt], r5, [r1, #16]\n\t" - "adcs %[rt], %[rt], lr\n\t" - "adcs r5, r5, lr\n\t" - "strd %[rt], r5, [r1, #16]\n\t" - "adcs r10, r10, lr\n\t" - "adc r11, r11, r4\n\t" - "strd r10, r11, [r1, #24]\n\t" - "add sp, sp, #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "lr" - ); - (void)px; - (void)py; - (void)pz; - (void)pt; - (void)qz; - (void)qt2d; - (void)qyplusx; - (void)qyminusx; -} - -#endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv7-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S similarity index 95% rename from wolfcrypt/src/port/arm/armv7-curve25519.S rename to wolfcrypt/src/port/arm/armv8-32-curve25519.S index 223a67c22..2d85bce04 100644 --- a/wolfcrypt/src/port/arm/armv7-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -1,4 +1,4 @@ -/* armv7-curve25519 +/* armv8-32-curve25519 * * Copyright (C) 2006-2019 wolfSSL Inc. 
* @@ -19,18 +19,22 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.S + */ #ifndef __aarch64__ -.text -.globl fe_init -.type fe_init, %function -.align 2 + .text + .align 2 + .globl fe_init + .type fe_init, %function fe_init: bx lr -.size fe_init,.-fe_init -.text -.globl fe_frombytes -.type fe_frombytes, %function -.align 2 + .size fe_init,.-fe_init + .text + .align 2 + .globl fe_frombytes + .type fe_frombytes, %function fe_frombytes: push {r4, r5, r6, r7, lr} ldrd r2, r3, [r1] @@ -45,11 +49,11 @@ fe_frombytes: strd r4, r5, [r0, #16] strd r6, r7, [r0, #24] pop {r4, r5, r6, r7, pc} -.size fe_frombytes,.-fe_frombytes -.text -.globl fe_tobytes -.type fe_tobytes, %function -.align 2 + .size fe_frombytes,.-fe_frombytes + .text + .align 2 + .globl fe_tobytes + .type fe_tobytes, %function fe_tobytes: push {r4, r5, r6, r7, r8, lr} ldrd r2, r3, [r1] @@ -82,11 +86,11 @@ fe_tobytes: strd r4, r5, [r0, #16] strd r6, r7, [r0, #24] pop {r4, r5, r6, r7, r8, pc} -.size fe_tobytes,.-fe_tobytes -.text -.globl fe_1 -.type fe_1, %function -.align 2 + .size fe_tobytes,.-fe_tobytes + .text + .align 2 + .globl fe_1 + .type fe_1, %function fe_1: # Set one mov r2, #1 @@ -100,11 +104,11 @@ fe_1: str r1, [r0, #24] str r1, [r0, #28] bx lr -.size fe_1,.-fe_1 -.text -.globl fe_0 -.type fe_0, %function -.align 2 + .size fe_1,.-fe_1 + .text + .align 2 + .globl fe_0 + .type fe_0, %function fe_0: # Set zero mov r1, #0 @@ -117,11 +121,11 @@ fe_0: str r1, [r0, #24] str r1, [r0, #28] bx lr -.size fe_0,.-fe_0 -.text -.globl fe_copy -.type fe_copy, %function -.align 2 + .size fe_0,.-fe_0 + .text + .align 2 + .globl fe_copy + .type fe_copy, %function fe_copy: push {lr} # Copy @@ -138,11 +142,11 @@ fe_copy: str r12, [r0, #24] str lr, [r0, #28] pop {pc} -.size fe_copy,.-fe_copy -.text -.globl fe_sub -.type fe_sub, %function -.align 2 + .size fe_copy,.-fe_copy + .text + .align 2 + .globl fe_sub + .type fe_sub, %function fe_sub: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} # Sub @@ -189,11 +193,11 @@ fe_sub: strd r6, r7, [r0, #16] strd r8, r9, [r0, #24] pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_sub,.-fe_sub -.text -.globl fe_add -.type fe_add, %function -.align 2 + .size fe_sub,.-fe_sub + .text + .align 2 + .globl fe_add + .type fe_add, %function fe_add: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} # Add @@ -240,11 +244,11 @@ fe_add: strd r6, r7, [r0, #16] strd r8, r9, [r0, #24] pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_add,.-fe_add -.text -.globl fe_neg -.type fe_neg, %function -.align 2 + .size fe_add,.-fe_add + .text + .align 2 + .globl fe_neg + .type fe_neg, %function fe_neg: push {r4, r5, lr} mov r5, #-1 @@ -271,11 +275,11 @@ fe_neg: str r12, [r0, #24] str lr, [r0, #28] pop {r4, r5, pc} -.size fe_neg,.-fe_neg -.text -.globl fe_isnonzero -.type fe_isnonzero, %function -.align 2 + .size fe_neg,.-fe_neg + .text + .align 2 + .globl fe_isnonzero + .type fe_isnonzero, %function fe_isnonzero: push {r4, r5, r6, r7, r8, lr} ldrd r2, r3, [r0] @@ -310,11 +314,11 @@ fe_isnonzero: orr r2, r2, r6 orr r0, r2, r12 pop {r4, r5, r6, r7, r8, pc} -.size fe_isnonzero,.-fe_isnonzero -.text -.globl fe_isnegative -.type fe_isnegative, %function -.align 2 + .size fe_isnonzero,.-fe_isnonzero + .text + .align 2 + .globl fe_isnegative + .type fe_isnegative, %function fe_isnegative: push {lr} ldrd r2, r3, [r0] @@ -336,11 +340,11 @@ fe_isnegative: lsr r1, r1, #31 
eor r0, r0, r1 pop {pc} -.size fe_isnegative,.-fe_isnegative -.text -.globl fe_cmov_table -.type fe_cmov_table, %function -.align 2 + .size fe_isnegative,.-fe_isnegative + .text + .align 2 + .globl fe_cmov_table + .type fe_cmov_table, %function fe_cmov_table: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sxtb r2, r2 @@ -1319,11 +1323,11 @@ fe_cmov_table: str r5, [r0, #88] str r6, [r0, #92] pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_cmov_table,.-fe_cmov_table -.text -.globl fe_mul -.type fe_mul, %function -.align 2 + .size fe_cmov_table,.-fe_cmov_table + .text + .align 2 + .globl fe_mul + .type fe_mul, %function fe_mul: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0x40 @@ -1848,11 +1852,11 @@ fe_mul: strd r10, r11, [r0, #24] add sp, sp, #0x40 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_mul,.-fe_mul -.text -.globl fe_sq -.type fe_sq, %function -.align 2 + .size fe_mul,.-fe_mul + .text + .align 2 + .globl fe_sq + .type fe_sq, %function fe_sq: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0x40 @@ -2269,11 +2273,11 @@ fe_sq: strd r10, r11, [r0, #24] add sp, sp, #0x40 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_sq,.-fe_sq -.text -.globl fe_mul121666 -.type fe_mul121666, %function -.align 2 + .size fe_sq,.-fe_sq + .text + .align 2 + .globl fe_mul121666 + .type fe_mul121666, %function fe_mul121666: push {r4, r5, r6, r7, r8, r9, r10, lr} # Multiply by 121666 @@ -2323,11 +2327,11 @@ fe_mul121666: strd r6, r7, [r0, #16] strd r8, r9, [r0, #24] pop {r4, r5, r6, r7, r8, r9, r10, pc} -.size fe_mul121666,.-fe_mul121666 -.text -.globl fe_sq2 -.type fe_sq2, %function -.align 2 + .size fe_mul121666,.-fe_mul121666 + .text + .align 2 + .globl fe_sq2 + .type fe_sq2, %function fe_sq2: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0x40 @@ -2759,11 +2763,11 @@ fe_sq2: strd r10, r11, [r0, #24] add sp, sp, #0x40 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_sq2,.-fe_sq2 -.text -.globl fe_invert -.type fe_invert, %function -.align 2 + .size fe_sq2,.-fe_sq2 + .text + .align 2 + .globl fe_invert + .type fe_invert, %function fe_invert: push {r4, lr} sub sp, sp, #0x88 @@ -2787,110 +2791,110 @@ fe_invert: mov r1, sp add r2, sp, #32 bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 mov r1, sp bl fe_sq add r0, sp, #32 add r1, sp, #32 - add r2, sp, #64 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq mov r4, #4 L_fe_invert1: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert1 add r0, sp, #32 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq mov r4, #9 L_fe_invert2: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert2 - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul - add r0, sp, #96 - add r1, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x40 bl fe_sq mov r4, #19 L_fe_invert3: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert3 - add r0, sp, #64 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul mov r4, #10 L_fe_invert4: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert4 add r0, sp, #32 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 
bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq - mov r4, #0x31 + mov r4, #49 L_fe_invert5: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert5 - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul - add r0, sp, #96 - add r1, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x40 bl fe_sq mov r4, #0x63 L_fe_invert6: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert6 - add r0, sp, #64 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul - mov r4, #0x32 + mov r4, #50 L_fe_invert7: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_invert7 add r0, sp, #32 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul mov r4, #5 @@ -2909,11 +2913,11 @@ L_fe_invert8: ldr r0, [sp, #128] add sp, sp, #0x88 pop {r4, pc} -.size fe_invert,.-fe_invert -.text -.globl curve25519 -.type curve25519, %function -.align 2 + .size fe_invert,.-fe_invert + .text + .align 2 + .globl curve25519 + .type curve25519, %function curve25519: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0xbc @@ -3282,7 +3286,7 @@ L_curve25519_bits: strd r10, r11, [sp, #120] ldr r2, [sp, #160] add r1, sp, #0x60 - add r0, sp, #0x20 + add r0, sp, #32 bl fe_mul add r2, sp, #0x80 add r1, sp, #0 @@ -3529,7 +3533,7 @@ L_curve25519_bits: strd r10, r11, [sp, #120] add r2, sp, #0 ldr r1, [sp, #168] - add r0, sp, #0x20 + add r0, sp, #32 bl fe_mul add r2, sp, #0x60 add r1, sp, #0x80 @@ -3549,136 +3553,136 @@ L_curve25519_bits: add r0, sp, #32 add r1, sp, #0 bl fe_sq - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #0 - add r2, sp, #64 + add r2, sp, #0x40 bl fe_mul add r0, sp, #32 add r1, sp, #32 - add r2, sp, #64 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #96 + add r0, sp, #0x60 add r1, sp, #32 bl fe_sq - add r0, sp, #64 - add r1, sp, #64 - add r2, sp, #96 + add r0, sp, #0x40 + add r1, sp, #0x40 + add r2, sp, #0x60 bl fe_mul - add r0, sp, #96 - add r1, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x40 bl fe_sq mov r4, #4 L_curve25519_inv_1: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_1 - add r0, sp, #64 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #96 - add r1, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x40 bl fe_sq mov r4, #9 L_curve25519_inv_2: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_2 - add r0, sp, #96 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #128 - add r1, sp, #96 + add r0, sp, #0x80 + add r1, sp, #0x60 bl fe_sq mov r4, #19 L_curve25519_inv_3: - add r0, sp, #128 - add r1, sp, #128 + add r0, sp, #0x80 + add r1, sp, #0x80 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_3 - add r0, sp, #96 - add r1, sp, #128 - add r2, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x80 + add r2, sp, #0x60 bl fe_mul mov r4, #10 L_curve25519_inv_4: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, 
r4, #1 cmp r4, #0 bne L_curve25519_inv_4 - add r0, sp, #64 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #96 - add r1, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x40 bl fe_sq - mov r4, #0x31 + mov r4, #49 L_curve25519_inv_5: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_5 - add r0, sp, #96 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x60 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul - add r0, sp, #128 - add r1, sp, #96 + add r0, sp, #0x80 + add r1, sp, #0x60 bl fe_sq mov r4, #0x63 L_curve25519_inv_6: - add r0, sp, #128 - add r1, sp, #128 + add r0, sp, #0x80 + add r1, sp, #0x80 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_6 - add r0, sp, #96 - add r1, sp, #128 - add r2, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x80 + add r2, sp, #0x60 bl fe_mul - mov r4, #0x32 + mov r4, #50 L_curve25519_inv_7: - add r0, sp, #96 - add r1, sp, #96 + add r0, sp, #0x60 + add r1, sp, #0x60 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_7 - add r0, sp, #64 - add r1, sp, #96 - add r2, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x60 + add r2, sp, #0x40 bl fe_mul mov r4, #5 L_curve25519_inv_8: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_curve25519_inv_8 add r0, sp, #0 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul add r2, sp, #0 @@ -3688,11 +3692,11 @@ L_curve25519_inv_8: mov r0, #0 add sp, sp, #0xbc pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size curve25519,.-curve25519 -.text -.globl fe_pow22523 -.type fe_pow22523, %function -.align 2 + .size curve25519,.-curve25519 + .text + .align 2 + .globl fe_pow22523 + .type fe_pow22523, %function fe_pow22523: push {r4, lr} sub sp, sp, #0x68 @@ -3753,19 +3757,19 @@ L_fe_pow22523_2: add r1, sp, #32 mov r2, sp bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq mov r4, #19 L_fe_pow22523_3: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_pow22523_3 add r0, sp, #32 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul mov r4, #10 @@ -3783,7 +3787,7 @@ L_fe_pow22523_4: add r0, sp, #32 mov r1, sp bl fe_sq - mov r4, #0x31 + mov r4, #49 L_fe_pow22523_5: add r0, sp, #32 add r1, sp, #32 @@ -3795,22 +3799,22 @@ L_fe_pow22523_5: add r1, sp, #32 mov r2, sp bl fe_mul - add r0, sp, #64 + add r0, sp, #0x40 add r1, sp, #32 bl fe_sq mov r4, #0x63 L_fe_pow22523_6: - add r0, sp, #64 - add r1, sp, #64 + add r0, sp, #0x40 + add r1, sp, #0x40 bl fe_sq sub r4, r4, #1 cmp r4, #0 bne L_fe_pow22523_6 add r0, sp, #32 - add r1, sp, #64 + add r1, sp, #0x40 add r2, sp, #32 bl fe_mul - mov r4, #0x32 + mov r4, #50 L_fe_pow22523_7: add r0, sp, #32 add r1, sp, #32 @@ -3838,11 +3842,11 @@ L_fe_pow22523_8: ldr r0, [sp, #96] add sp, sp, #0x68 pop {r4, pc} -.size fe_pow22523,.-fe_pow22523 -.text -.globl fe_ge_to_p2 -.type fe_ge_to_p2, %function -.align 2 + .size fe_pow22523,.-fe_pow22523 + .text + .align 2 + .globl fe_ge_to_p2 + .type fe_ge_to_p2, %function fe_ge_to_p2: push {lr} sub sp, sp, #16 @@ -3864,11 +3868,11 @@ fe_ge_to_p2: bl fe_mul add sp, sp, #16 pop {pc} -.size fe_ge_to_p2,.-fe_ge_to_p2 -.text -.globl fe_ge_to_p3 -.type fe_ge_to_p3, %function -.align 2 + .size fe_ge_to_p2,.-fe_ge_to_p2 + .text + .align 2 + .globl fe_ge_to_p3 + .type fe_ge_to_p3, %function fe_ge_to_p3: push {lr} sub sp, sp, #16 @@ -3894,11 +3898,11 @@ 
fe_ge_to_p3: bl fe_mul add sp, sp, #16 pop {pc} -.size fe_ge_to_p3,.-fe_ge_to_p3 -.text -.globl fe_ge_dbl -.type fe_ge_dbl, %function -.align 2 + .size fe_ge_to_p3,.-fe_ge_to_p3 + .text + .align 2 + .globl fe_ge_dbl + .type fe_ge_dbl, %function fe_ge_dbl: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #16 @@ -4223,14 +4227,14 @@ fe_ge_dbl: str r10, [r0, #28] add sp, sp, #16 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_ge_dbl,.-fe_ge_dbl -.text -.globl fe_ge_madd -.type fe_ge_madd, %function -.align 2 + .size fe_ge_dbl,.-fe_ge_dbl + .text + .align 2 + .globl fe_ge_madd + .type fe_ge_madd, %function fe_ge_madd: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x20 + sub sp, sp, #32 str r0, [sp] str r1, [sp, #4] str r2, [sp, #8] @@ -4663,16 +4667,16 @@ fe_ge_madd: adc r10, r10, lr str r9, [r1, #24] str r10, [r1, #28] - add sp, sp, #0x20 + add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_ge_madd,.-fe_ge_madd -.text -.globl fe_ge_msub -.type fe_ge_msub, %function -.align 2 + .size fe_ge_madd,.-fe_ge_madd + .text + .align 2 + .globl fe_ge_msub + .type fe_ge_msub, %function fe_ge_msub: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x20 + sub sp, sp, #32 str r0, [sp] str r1, [sp, #4] str r2, [sp, #8] @@ -5105,13 +5109,13 @@ fe_ge_msub: adc r10, r10, lr str r9, [r1, #24] str r10, [r1, #28] - add sp, sp, #0x20 + add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_ge_msub,.-fe_ge_msub -.text -.globl fe_ge_add -.type fe_ge_add, %function -.align 2 + .size fe_ge_msub,.-fe_ge_msub + .text + .align 2 + .globl fe_ge_add + .type fe_ge_add, %function fe_ge_add: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0x60 @@ -5554,11 +5558,11 @@ fe_ge_add: str r10, [r1, #28] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_ge_add,.-fe_ge_add -.text -.globl fe_ge_sub -.type fe_ge_sub, %function -.align 2 + .size fe_ge_add,.-fe_ge_add + .text + .align 2 + .globl fe_ge_sub + .type fe_ge_sub, %function fe_ge_sub: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #0x60 @@ -6001,5 +6005,5 @@ fe_ge_sub: str r10, [r1, #28] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} -.size fe_ge_sub,.-fe_ge_sub -#endif /* __aarch64__ */ + .size fe_ge_sub,.-fe_ge_sub +#endif /* !__aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.c b/wolfcrypt/src/port/arm/armv8-32-curve25519.c new file mode 100644 index 000000000..c1f3344d2 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.c @@ -0,0 +1,5576 @@ +/* armv8-32-curve25519 + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-curve25519.c + */ +#ifndef __aarch64__ +#include +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include +#include + +void fe_init() +{ + __asm__ __volatile__ ( + "\n\t" + : + : + : "memory" + ); +} + +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[in]]\n\t" + "ldrd r12, lr, [%[in], #8]\n\t" + "ldrd r4, r5, [%[in], #16]\n\t" + "ldrd r6, r7, [%[in], #24]\n\t" + "and r7, r7, #0x7fffffff\n\t" + "strd r2, r3, [%[out]]\n\t" + "strd r12, lr, [%[out], #8]\n\t" + "strd r4, r5, [%[out], #16]\n\t" + "strd r6, r7, [%[out], #24]\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[in]]\n\t" + "ldrd r12, lr, [%[in], #8]\n\t" + "ldrd r4, r5, [%[in], #16]\n\t" + "ldrd r6, r7, [%[in], #24]\n\t" + "adds r8, r2, #19\n\t" + "adcs r8, r3, #0\n\t" + "adcs r8, r12, #0\n\t" + "adcs r8, lr, #0\n\t" + "adcs r8, r4, #0\n\t" + "adcs r8, r5, #0\n\t" + "adcs r8, r6, #0\n\t" + "adc r8, r7, #0\n\t" + "asr r8, r8, #31\n\t" + "and r8, r8, #19\n\t" + "adds r2, r2, r8\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs lr, lr, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + "and r7, r7, #0x7fffffff\n\t" + "strd r2, r3, [%[out]]\n\t" + "strd r12, lr, [%[out], #8]\n\t" + "strd r4, r5, [%[out], #16]\n\t" + "strd r6, r7, [%[out], #24]\n\t" + : [out] "+r" (out), [n] "+r" (n) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + /* Set one */ + "mov r2, #1\n\t" + "mov r1, #0\n\t" + "strd r2, r1, [%[n]]\n\t" + "strd r1, r1, [%[n], #8]\n\t" + "strd r1, r1, [%[n], #16]\n\t" + "strd r1, r1, [%[n], #24]\n\t" + : [n] "+r" (n) + : + : "memory", "r1", "r2" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + /* Set zero */ + "mov r1, #0\n\t" + "strd r1, r1, [%[n]]\n\t" + "strd r1, r1, [%[n], #8]\n\t" + "strd r1, r1, [%[n], #16]\n\t" + "strd r1, r1, [%[n], #24]\n\t" + : [n] "+r" (n) + : + : "memory", "r1" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + /* Copy */ + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r12, lr, [%[r], #8]\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "strd r2, r3, [%[r], #16]\n\t" + "strd r12, lr, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + /* Sub */ + "ldrd r12, lr, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b]]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "subs r6, r12, r6\n\t" + "sbcs r7, lr, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "strd r6, r7, [%[r]]\n\t" + "strd r8, r9, [%[r], #8]\n\t" + "ldrd r12, lr, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "sbcs r6, r12, r6\n\t" + "sbcs r7, lr, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbc r9, r5, r9\n\t" + "mov r10, #-19\n\t" + "asr 
r3, r9, #31\n\t" + /* Mask the modulus */ + "and r10, r3, r10\n\t" + "and r11, r3, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r12, lr, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "adds r12, r12, r10\n\t" + "adcs lr, lr, r3\n\t" + "adcs r4, r4, r3\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r3\n\t" + "adcs r7, r7, r3\n\t" + "adcs r8, r8, r3\n\t" + "adc r9, r9, r11\n\t" + "strd r12, lr, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + /* Add */ + "ldrd r12, lr, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[b]]\n\t" + "ldrd r8, r9, [%[b], #8]\n\t" + "adds r6, r12, r6\n\t" + "adcs r7, lr, r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "strd r6, r7, [%[r]]\n\t" + "strd r8, r9, [%[r], #8]\n\t" + "ldrd r12, lr, [%[a], #16]\n\t" + "ldrd r4, r5, [%[a], #24]\n\t" + "ldrd r6, r7, [%[b], #16]\n\t" + "ldrd r8, r9, [%[b], #24]\n\t" + "adcs r6, r12, r6\n\t" + "adcs r7, lr, r7\n\t" + "adcs r8, r4, r8\n\t" + "adc r9, r5, r9\n\t" + "mov r10, #-19\n\t" + "asr r3, r9, #31\n\t" + /* Mask the modulus */ + "and r10, r3, r10\n\t" + "and r11, r3, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r12, lr, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "subs r12, r12, r10\n\t" + "sbcs lr, lr, r3\n\t" + "sbcs r4, r4, r3\n\t" + "sbcs r5, r5, r3\n\t" + "sbcs r6, r6, r3\n\t" + "sbcs r7, r7, r3\n\t" + "sbcs r8, r8, r3\n\t" + "sbc r9, r9, r11\n\t" + "strd r12, lr, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "mov r5, #-1\n\t" + "mov r4, #-19\n\t" + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "subs r2, r4, r2\n\t" + "sbcs r3, r5, r3\n\t" + "sbcs r12, r5, r12\n\t" + "sbcs lr, r5, lr\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r12, lr, [%[r], #8]\n\t" + "mov r4, #0x7fffffff\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "sbcs r2, r5, r2\n\t" + "sbcs r3, r5, r3\n\t" + "sbcs r12, r5, r12\n\t" + "sbc lr, r4, lr\n\t" + "strd r2, r3, [%[r], #16]\n\t" + "strd r12, lr, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "ldrd r4, r5, [%[a], #16]\n\t" + "ldrd r6, r7, [%[a], #24]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, lr, #0\n\t" + "adcs r1, r4, #0\n\t" + "adcs r1, r5, #0\n\t" + "adcs r1, r6, #0\n\t" + "adc r1, r7, #0\n\t" + "asr r1, r1, #31\n\t" + "and r1, r1, #19\n\t" + "adds r2, r2, r1\n\t" + "adcs r3, r3, #0\n\t" + "adcs r12, r12, #0\n\t" + "adcs lr, lr, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + "and r7, r7, #0x7fffffff\n\t" + "orr r2, r2, r3\n\t" + "orr r12, r12, lr\n\t" + "orr r4, r4, r5\n\t" + "orr r6, r6, r7\n\t" + "orr r12, r12, r4\n\t" + "orr r2, r2, r6\n\t" + "orr %[a], r2, r12\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" + ); + return 
(uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r12, lr, [%[a], #8]\n\t" + "adds r1, r2, #19\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "adcs r1, lr, #0\n\t" + "ldrd r2, r3, [%[a], #16]\n\t" + "ldrd r12, lr, [%[a], #24]\n\t" + "adcs r1, r2, #0\n\t" + "adcs r1, r3, #0\n\t" + "adcs r1, r12, #0\n\t" + "ldr r2, [%[a]]\n\t" + "adc r1, lr, #0\n\t" + "and %[a], r2, #1\n\t" + "lsr r1, r1, #31\n\t" + "eor %[a], %[a], r1\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r12", "lr" + ); + return (uint32_t)(size_t)a; +} + +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "sxtb %[b], %[b]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #1\n\t" + "mov r12, #0\n\t" + "mov lr, #1\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, 
#0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base]]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #32]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #64]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-19\n\t" + "mov r9, #-1\n\t" + "subs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r]]\n\t" + "strd lr, r4, [%[r], #32]\n\t" + "strd r5, r6, [%[r], #64]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, 
r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" 
+ "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #8]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #40]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #72]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #8]\n\t" + "strd lr, r4, [%[r], #40]\n\t" + "strd r5, r6, [%[r], #72]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, 
r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #16]\n\t" 
+ "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #48]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #80]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #-1\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbcs r9, r9, r6\n\t" + "sbc r11, r11, r11\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], #16]\n\t" + "strd lr, r4, [%[r], #48]\n\t" + "strd r5, r6, [%[r], #80]\n\t" + "sbfx r7, %[b], #7, #1\n\t" + "eor r10, %[b], r7\n\t" + "sub r10, r10, r7\n\t" + "mov r3, #0\n\t" + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "mov r4, #0\n\t" + "mov r5, #0\n\t" + "mov r6, #0\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #31\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #30\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #29\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #28\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + 
"eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #27\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #26\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #25\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "add %[base], %[base], #0x60\n\t" + "mov r7, #0x80000000\n\t" + "ror r7, r7, #24\n\t" + "ror r7, r7, r10\n\t" + "asr r7, r7, #31\n\t" + "ldrd r8, r9, [%[base], #24]\n\t" + "eor r8, r8, r3\n\t" + "eor r9, r9, r12\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r3, r3, r8\n\t" + "eor r12, r12, r9\n\t" + "ldrd r8, r9, [%[base], #56]\n\t" + "eor r8, r8, lr\n\t" + "eor r9, r9, r4\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor lr, lr, r8\n\t" + "eor r4, r4, r9\n\t" + "ldrd r8, r9, [%[base], #88]\n\t" + "eor r8, r8, r5\n\t" + "eor r9, r9, r6\n\t" + "and r8, r8, r7\n\t" + "and r9, r9, r7\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "sub %[base], %[base], #0x2a0\n\t" + "mov r8, #-1\n\t" + "mov r9, #0x7fffffff\n\t" + "rsbs r11, r11, #0\n\t" + "sbcs r8, r8, r5\n\t" + "sbc r9, r9, r6\n\t" + "asr r10, %[b], #31\n\t" + "eor r7, r3, lr\n\t" + "and r7, r7, r10\n\t" + "eor r3, r3, r7\n\t" + "eor lr, lr, r7\n\t" + "eor r7, r12, r4\n\t" + "and r7, r7, r10\n\t" + "eor r12, r12, r7\n\t" + "eor r4, r4, r7\n\t" + "eor r8, r8, r5\n\t" + "and r8, r8, r10\n\t" + "eor r5, r5, r8\n\t" + "eor r9, r9, r6\n\t" + "and r9, r9, r10\n\t" + "eor r6, r6, r9\n\t" + "strd r3, r12, [%[r], 
#24]\n\t" + "strd lr, r4, [%[r], #56]\n\t" + "strd r5, r6, [%[r], #88]\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Multiply */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[b]]\n\t" + "ldr lr, [%[b], #4]\n\t" + /* A[0] * B[0] = 0 */ + "umull r4, r5, r7, r9\n\t" + "str r4, [sp]\n\t" + /* A[0] * B[1] = 1 */ + "umull r3, r6, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[0] = 1 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[2] * B[0] = 2 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + /* A[1] * B[1] = 2 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[0] * B[2] = 2 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * B[3] = 3 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[2] = 3 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[1] = 3 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[0] = 3 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[4] * B[0] = 4 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[1] = 4 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[2] = 4 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[3] = 4 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * B[4] = 4 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * B[5] = 5 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[4] = 5 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[3] = 5 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[2] = 5 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[1] = 5 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[0] = 5 */ 
+ "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[6] * B[0] = 6 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[1] = 6 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[2] = 6 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[3] = 6 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * B[4] = 6 */ + "ldr r10, [%[a], #8]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * B[5] = 6 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * B[6] = 6 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * B[7] = 7 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[6] = 7 */ + "ldr r11, [%[b], #24]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[5] = 7 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[4] = 7 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[3] = 7 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[2] = 7 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[1] = 7 */ + "ldr r10, [%[a], #24]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[0] = 7 */ + "ldr r10, [%[a], #28]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + "ldr r7, [%[a], #24]\n\t" + "ldr r9, [%[b], #24]\n\t" + /* A[7] * B[1] = 8 */ + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[2] = 8 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[3] = 8 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[4] = 8 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[5] = 8 */ + "ldr r10, [%[a], #12]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + 
"adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[6] = 8 */ + "ldr r10, [%[a], #8]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[7] = 8 */ + "ldr r11, [%[b], #28]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r8, [%[a], #28]\n\t" + "mov lr, r11\n\t" + /* A[2] * B[7] = 9 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * B[6] = 9 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * B[5] = 9 */ + "ldr r10, [%[a], #16]\n\t" + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[4] = 9 */ + "ldr r10, [%[a], #20]\n\t" + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[3] = 9 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[7] * B[2] = 9 */ + "ldr r11, [%[b], #8]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + /* A[7] * B[3] = 10 */ + "ldr r11, [%[b], #12]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[4] = 10 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r7, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[5] = 10 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r10, r11\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[6] = 10 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[7] = 10 */ + "ldr r10, [%[a], #12]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + /* A[4] * B[7] = 11 */ + "ldr r10, [%[a], #16]\n\t" + "umull r3, r12, r10, lr\n\t" + "adds r6, r6, r3\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[6] = 11 */ + "ldr r10, [%[a], #20]\n\t" + "umull r3, r12, r10, r9\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[5] = 11 */ + "umull r3, r12, r7, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[4] = 11 */ + "ldr r11, [%[b], #16]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r6, r6, r3\n\t" + "adcs r4, r4, r12\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[7] * B[5] = 12 */ + "ldr r11, [%[b], #20]\n\t" + "umull r3, r12, r8, r11\n\t" + "adds r4, r4, r3\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[6] * B[6] = 12 */ + "umull r3, r12, r7, r9\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * B[7] = 12 */ + "umull r3, r12, r10, lr\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, r12\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * B[7] = 13 */ + "umull r3, r12, r7, lr\n\t" + "adds r5, r5, r3\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + /* A[7] 
* B[6] = 13 */ + "umull r3, r12, r8, r9\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * B[7] = 14 */ + "umull r3, r12, r8, lr\n\t" + "adds r6, r6, r3\n\t" + "adc r4, r4, r12\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r3, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov lr, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r4, r4, r3\n\t" + "mov %[b], #0\n\t" + "adcs r5, r5, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r5, r5, r3\n\t" + "mov %[b], #0\n\t" + "adcs r6, r6, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r6, r6, r3\n\t" + "mov %[b], #0\n\t" + "adcs r7, r7, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r7, r7, r3\n\t" + "mov %[b], #0\n\t" + "adcs r8, r8, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r8, r8, r3\n\t" + "mov %[b], #0\n\t" + "adcs r9, r9, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r9, r9, r3\n\t" + "mov %[b], #0\n\t" + "adcs r10, r10, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "add r12, r12, %[b]\n\t" + "adds r10, r10, r3\n\t" + "mov %[b], #0\n\t" + "adcs r11, r11, r12\n\t" + "adc %[b], %[b], #0\n\t" + "lsr r3, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r3, r3, %[a], lsl #1\n\t" + "umull r3, r12, lr, r3\n\t" + "adds r11, r11, r3\n\t" + "adc r3, r12, %[b]\n\t" + /* Overflow */ + "lsl r3, r3, #1\n\t" + "orr r3, r3, r11, lsr #31\n\t" + "mul r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r3, r11, #31\n\t" + "and r3, r3, lr\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull 
r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], 
#20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r9, r11\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, 
r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #31\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #31\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #1\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "ldrd r2, r3, [%[a]]\n\t" + "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [%[a], #16]\n\t" + "ldrd r8, r9, [%[a], #24]\n\t" + "movw lr, #0xdb42\n\t" + "movt lr, #1\n\t" + "umull r2, r10, r2, lr\n\t" + "umull r3, r12, r3, lr\n\t" + "adds r3, r3, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r4, r12, r4, lr\n\t" + "adds r4, r4, r10\n\t" + "adc r10, r12, #0\n\t" + "umull 
r5, r12, r5, lr\n\t" + "adds r5, r5, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r6, r12, r6, lr\n\t" + "adds r6, r6, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r7, r12, r7, lr\n\t" + "adds r7, r7, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r8, r12, r8, lr\n\t" + "adds r8, r8, r10\n\t" + "adc r10, r12, #0\n\t" + "umull r9, r12, r9, lr\n\t" + "adds r9, r9, r10\n\t" + "adc r10, r12, #0\n\t" + "mov lr, #19\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "mul r10, r10, lr\n\t" + "and r9, r9, #0x7fffffff\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "strd r2, r3, [%[r]]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [%[r], #16]\n\t" + "strd r8, r9, [%[r], #24]\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x40\n\t" + /* Square * 2 */ + "ldr r7, [%[a]]\n\t" + "ldr r8, [%[a], #4]\n\t" + "ldr r9, [%[a], #8]\n\t" + "ldr r10, [%[a], #12]\n\t" + "ldr r12, [%[a], #16]\n\t" + /* A[0] * A[0] = 0 */ + "umull r4, r5, r7, r7\n\t" + "str r4, [sp]\n\t" + /* A[0] * A[1] = 1 */ + "umull r2, r3, r7, r8\n\t" + "mov r6, #0\n\t" + "adds r5, r5, r2\n\t" + "adc r6, r6, r3\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #4]\n\t" + /* A[1] * A[1] = 2 */ + "umull r2, r3, r8, r8\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + /* A[0] * A[2] = 2 */ + "umull r2, r3, r7, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #8]\n\t" + /* A[0] * A[3] = 3 */ + "umull r2, r3, r7, r10\n\t" + "adds r4, r4, r2\n\t" + "adc r5, r5, r3\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[2] = 3 */ + "umull r2, r3, r8, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #12]\n\t" + /* A[2] * A[2] = 4 */ + "umull r2, r3, r9, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[3] = 4 */ + "umull r2, r3, r8, r10\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[0] * A[4] = 4 */ + "umull r2, r3, r7, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #16]\n\t" + /* A[0] * A[5] = 5 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[4] = 5 */ + "umull r2, r3, r8, r12\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[3] = 5 */ + "umull r2, r3, r9, r10\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #20]\n\t" + /* A[3] * A[3] = 6 */ + "umull r2, r3, r10, r10\n\t" + "adds 
r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[2] * A[4] = 6 */ + "umull r2, r3, r9, r12\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[1] * A[5] = 6 */ + "umull r2, r3, r8, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[0] * A[6] = 6 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #24]\n\t" + /* A[0] * A[7] = 7 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r7, r11\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[6] = 7 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * A[5] = 7 */ + "ldr r11, [%[a], #20]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[4] = 7 */ + "umull r2, r3, r10, r12\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #28]\n\t" + /* A[4] * A[4] = 8 */ + "umull r2, r3, r12, r12\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * A[5] = 8 */ + "umull r2, r3, r10, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * A[6] = 8 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r9, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * A[7] = 8 */ + "ldr r11, [%[a], #28]\n\t" + "umull r2, r3, r8, r11\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #32]\n\t" + "ldr r7, [%[a], #20]\n\t" + /* A[2] * A[7] = 9 */ + "umull r2, r3, r9, r11\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[3] * A[6] = 9 */ + "ldr r11, [%[a], #24]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[4] * A[5] = 9 */ + "umull r2, r3, r12, r7\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #36]\n\t" + "mov r8, r11\n\t" + /* A[5] * A[5] = 10 */ + "umull r2, r3, r7, r7\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * A[6] = 10 */ + "umull r2, r3, r12, r8\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * A[7] = 10 */ + "ldr r11, 
[%[a], #28]\n\t" + "umull r2, r3, r10, r11\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #40]\n\t" + "mov r9, r11\n\t" + /* A[4] * A[7] = 11 */ + "umull r2, r3, r12, r9\n\t" + "adds r6, r6, r2\n\t" + "mov r5, #0\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * A[6] = 11 */ + "umull r2, r3, r7, r8\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "adds r6, r6, r2\n\t" + "adcs r4, r4, r3\n\t" + "adc r5, r5, #0\n\t" + "str r6, [sp, #44]\n\t" + /* A[6] * A[6] = 12 */ + "umull r2, r3, r8, r8\n\t" + "adds r4, r4, r2\n\t" + "mov r6, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + /* A[5] * A[7] = 12 */ + "umull r2, r3, r7, r9\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adc r6, r6, #0\n\t" + "str r4, [sp, #48]\n\t" + /* A[6] * A[7] = 13 */ + "umull r2, r3, r8, r9\n\t" + "adds r5, r5, r2\n\t" + "mov r4, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r2\n\t" + "adcs r6, r6, r3\n\t" + "adc r4, r4, #0\n\t" + "str r5, [sp, #52]\n\t" + /* A[7] * A[7] = 14 */ + "umull r2, r3, r9, r9\n\t" + "adds r6, r6, r2\n\t" + "adc r4, r4, r3\n\t" + "str r6, [sp, #56]\n\t" + "str r4, [sp, #60]\n\t" + /* Double and Reduce */ + /* Load bottom half */ + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "ldrd r8, r9, [sp, #16]\n\t" + "ldrd r10, r11, [sp, #24]\n\t" + "lsr r2, r11, #30\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, lsr #31\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "lsl r9, r9, #1\n\t" + "orr r9, r9, r8, lsr #31\n\t" + "lsl r8, r8, #1\n\t" + "orr r8, r8, r7, lsr #31\n\t" + "lsl r7, r7, #1\n\t" + "orr r7, r7, r6, lsr #31\n\t" + "lsl r6, r6, #1\n\t" + "orr r6, r6, r5, lsr #31\n\t" + "lsl r5, r5, #1\n\t" + "orr r5, r5, r4, lsr #31\n\t" + "lsl r4, r4, #1\n\t" + "and r11, r11, #0x7fffffff\n\t" + "mov r12, #19\n\t" + "ldr %[a], [sp, #32]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r4, r4, r2\n\t" + "mov lr, #0\n\t" + "adcs r5, r5, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #36]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r5, r5, r2\n\t" + "mov lr, #0\n\t" + "adcs r6, r6, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #40]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r6, r6, r2\n\t" + "mov lr, #0\n\t" + "adcs r7, r7, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #44]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r7, r7, r2\n\t" + "mov lr, #0\n\t" + "adcs r8, r8, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #48]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r8, r8, r2\n\t" + "mov lr, #0\n\t" + "adcs r9, r9, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #52]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "add r3, r3, lr\n\t" + "adds r9, r9, r2\n\t" + "mov lr, #0\n\t" + "adcs r10, r10, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #56]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + 
"add r3, r3, lr\n\t" + "adds r10, r10, r2\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r3\n\t" + "adc lr, lr, #0\n\t" + "lsr r2, %[a], #30\n\t" + "ldr %[a], [sp, #60]\n\t" + "orr r2, r2, %[a], lsl #2\n\t" + "umull r2, r3, r12, r2\n\t" + "adds r11, r11, r2\n\t" + "adc r2, r3, lr\n\t" + /* Overflow */ + "lsl r2, r2, #1\n\t" + "orr r2, r2, r11, lsr #31\n\t" + "mul r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Reduce if top bit set */ + "asr r2, r11, #31\n\t" + "and r2, r2, r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + /* Store */ + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [%[r], #8]\n\t" + "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #24]\n\t" + "add sp, sp, #0x40\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x88\n\t" + /* Invert */ + "str %[r], [sp, #128]\n\t" + "str %[a], [sp, #132]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #132]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #132]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_invert1_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert1_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_invert2_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert2_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_invert3_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert3_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_invert4_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert4_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_invert5_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert5_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add 
r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_invert6_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert6_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert7_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_invert8_%=\n\t" + "ldr r0, [sp, #128]\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #132]\n\t" + "ldr %[r], [sp, #128]\n\t" + "add sp, sp, #0x88\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +int curve25519(byte* r, byte* n, byte* a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xbc\n\t" + "str %[r], [sp, #160]\n\t" + "str %[n], [sp, #164]\n\t" + "str %[a], [sp, #168]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #172]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [%[r]]\n\t" + "strd r10, r10, [%[r], #8]\n\t" + "strd r10, r10, [%[r], #16]\n\t" + "strd r10, r10, [%[r], #24]\n\t" + /* Set zero */ + "mov r10, #0\n\t" + "strd r10, r10, [sp]\n\t" + "strd r10, r10, [sp, #8]\n\t" + "strd r10, r10, [sp, #16]\n\t" + "strd r10, r10, [sp, #24]\n\t" + /* Set one */ + "mov r11, #1\n\t" + "mov r10, #0\n\t" + "strd r11, r10, [sp, #32]\n\t" + "strd r10, r10, [sp, #40]\n\t" + "strd r10, r10, [sp, #48]\n\t" + "strd r10, r10, [sp, #56]\n\t" + /* Copy */ + "ldrd r4, r5, [%[a]]\n\t" + "ldrd r6, r7, [%[a], #8]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[a], #16]\n\t" + "ldrd r6, r7, [%[a], #24]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "mov %[n], #30\n\t" + "str %[n], [sp, #180]\n\t" + "mov %[a], #28\n\t" + "str %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words_%=: \n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "ldr %[n], [sp, #164]\n\t" + "ldr %[a], [%[n], r2]\n\t" + "ldr %[n], [sp, #180]\n\t" + "lsr %[a], %[a], %[n]\n\t" + "and %[a], %[a], #1\n\t" + "str %[a], [sp, #184]\n\t" + "ldr %[n], [sp, #172]\n\t" + "eor %[n], %[n], %[a]\n\t" + "str %[n], [sp, #172]\n\t" + "ldr %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r]]\n\t" + "strd r6, r7, [sp, #64]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "strd r6, r7, [sp, #72]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], 
#16]\n\t" + "strd r6, r7, [sp, #80]\n\t" + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [%[r], #24]\n\t" + "strd r6, r7, [sp, #88]\n\t" + "ldr %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "neg %[n], %[n]\n\t" + "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp]\n\t" + "strd r6, r7, [sp, #32]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #8]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #16]\n\t" + "strd r6, r7, [sp, #48]\n\t" + "ldrd r4, r5, [sp, #24]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" + "strd r4, r5, [sp, #24]\n\t" + "strd r6, r7, [sp, #56]\n\t" + "ldr %[n], [sp, #184]\n\t" + "str %[n], [sp, #172]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r]]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #128]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #8]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #136]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #16]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [%[r], #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #144]\n\t" + /* Add */ + "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [%[r]]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [%[r], #8]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, 
%[a]\n\t" + "strd r4, r5, [%[r], #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [%[r], #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #128]\n\t" + "ldrd r4, r5, [sp, #136]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #144]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #152]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #64]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #96]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #72]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #8]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #104]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #80]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #16]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #112]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #88]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #16]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #24]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #96]\n\t" + "ldrd r4, r5, [sp, #104]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #112]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #120]\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #160]\n\t" + "add r0, sp, 
#0x80\n\t" + "bl fe_sq\n\t" + /* Add-Sub */ + /* Add */ + "ldrd r4, r5, [sp, #32]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #64]\n\t" + /* Sub */ + "subs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #40]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #72]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #8]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #48]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "mov r3, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "strd r8, r9, [sp, #80]\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "mov r12, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc r12, r12, #0\n\t" + "strd r10, r11, [sp, #16]\n\t" + /* Add */ + "ldrd r4, r5, [sp, #56]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r3, r3, #-1\n\t" + "adcs r8, r4, r6\n\t" + "adc r9, r5, r7\n\t" + /* Sub */ + "adds r12, r12, #-1\n\t" + "sbcs r10, r4, r6\n\t" + "sbc r11, r5, r7\n\t" + "mov r3, #-19\n\t" + "asr %[a], r9, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #64]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #64]\n\t" + "ldrd r4, r5, [sp, #72]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #72]\n\t" + "ldrd r4, r5, [sp, #80]\n\t" + "sbcs r4, r4, %[a]\n\t" + "sbcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #80]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbc r9, r9, r12\n\t" + "strd r8, r9, [sp, #88]\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #8]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" + "adcs r4, r4, %[a]\n\t" + "adcs r5, r5, %[a]\n\t" + "strd r4, r5, [sp, #16]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r10, r11, [sp, #24]\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + /* Sub */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #96]\n\t" + "ldrd r10, r11, [sp, #104]\n\t" + "subs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #128]\n\t" + "strd r10, r11, [sp, #136]\n\t" + "ldrd r4, r5, [sp, #144]\n\t" + "ldrd r6, r7, [sp, #152]\n\t" + "ldrd r8, r9, [sp, #112]\n\t" + "ldrd r10, r11, [sp, #120]\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "adds r4, r4, r3\n\t" + "adcs r5, r5, %[a]\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, %[a]\n\t" + "adcs r8, r8, %[a]\n\t" + "adcs r9, r9, 
%[a]\n\t" + "adcs r10, r10, %[a]\n\t" + "adc r11, r11, r12\n\t" + "strd r4, r5, [sp, #128]\n\t" + "strd r6, r7, [sp, #136]\n\t" + "strd r8, r9, [sp, #144]\n\t" + "strd r10, r11, [sp, #152]\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0\n\t" + "bl fe_sq\n\t" + /* Multiply by 121666 */ + "ldrd r4, r5, [sp, #128]\n\t" + "ldrd r6, r7, [sp, #136]\n\t" + "ldrd r8, r9, [sp, #144]\n\t" + "ldrd r10, r11, [sp, #152]\n\t" + "movw r12, #0xdb42\n\t" + "movt r12, #1\n\t" + "umull r4, %[a], r4, r12\n\t" + "umull r5, r3, r5, r12\n\t" + "adds r5, r5, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r6, r3, r6, r12\n\t" + "adds r6, r6, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r7, r3, r7, r12\n\t" + "adds r7, r7, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r8, r3, r8, r12\n\t" + "adds r8, r8, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r9, r3, r9, r12\n\t" + "adds r9, r9, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r10, r3, r10, r12\n\t" + "adds r10, r10, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "umull r11, r3, r11, r12\n\t" + "adds r11, r11, %[a]\n\t" + "adc %[a], r3, #0\n\t" + "mov r12, #19\n\t" + "lsl %[a], %[a], #1\n\t" + "orr %[a], %[a], r11, lsr #31\n\t" + "mul %[a], %[a], r12\n\t" + "and r11, r11, #0x7fffffff\n\t" + "adds r4, r4, %[a]\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "strd r4, r5, [sp, #32]\n\t" + "strd r6, r7, [sp, #40]\n\t" + "strd r8, r9, [sp, #48]\n\t" + "strd r10, r11, [sp, #56]\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq\n\t" + /* Add */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "ldrd r8, r9, [sp, #32]\n\t" + "ldrd r10, r11, [sp, #40]\n\t" + "adds r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [sp, #96]\n\t" + "strd r10, r11, [sp, #104]\n\t" + "ldrd r4, r5, [sp, #112]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "ldrd r8, r9, [sp, #48]\n\t" + "ldrd r10, r11, [sp, #56]\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" + "mov r3, #-19\n\t" + "asr %[a], r11, #31\n\t" + /* Mask the modulus */ + "and r3, %[a], r3\n\t" + "and r12, %[a], #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd r4, r5, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, %[a]\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbcs r7, r7, %[a]\n\t" + "sbcs r8, r8, %[a]\n\t" + "sbcs r9, r9, %[a]\n\t" + "sbcs r10, r10, %[a]\n\t" + "sbc r11, r11, r12\n\t" + "strd r4, r5, [sp, #96]\n\t" + "strd r6, r7, [sp, #104]\n\t" + "strd r8, r9, [sp, #112]\n\t" + "strd r10, r11, [sp, #120]\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #168]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #176]\n\t" + "ldr %[n], [sp, #180]\n\t" + "subs %[n], %[n], #1\n\t" + "str %[n], [sp, #180]\n\t" + "bge L_curve25519_bits_%=\n\t" + "mov %[n], #31\n\t" + "str %[n], [sp, #180]\n\t" + "subs %[a], %[a], #4\n\t" + "str %[a], [sp, #176]\n\t" + "bge L_curve25519_words_%=\n\t" + /* Invert */ + "add r0, sp, #32\n\t" + "add r1, sp, #0\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "add r2, sp, 
#0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "add r0, sp, #0x80\n\t" + "add r1, sp, #0x80\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r2, sp, #0x60\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "add r0, sp, #0x60\n\t" + "add r1, sp, #0x60\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r2, sp, #0x40\n\t" + "bl fe_mul\n\t" + "mov r4, #5\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add r0, sp, #0\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "add r2, sp, #0\n\t" + "ldr r1, [sp, #160]\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xbc\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)r; +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x68\n\t" + /* pow22523 */ + "str %[r], [sp, #96]\n\t" + "str %[a], [sp, #100]\n\t" + "mov r0, sp\n\t" + "ldr r1, [sp, #100]\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + 
"add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "add r0, sp, #32\n\t" + "ldr r1, [sp, #100]\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #4\n\t" + "\n" + "L_fe_pow22523_1_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_1_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #9\n\t" + "\n" + "L_fe_pow22523_2_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_2_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #19\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_3_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #10\n\t" + "\n" + "L_fe_pow22523_4_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_4_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #32\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "mov r4, #49\n\t" + "\n" + "L_fe_pow22523_5_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_5_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "mov r4, #0x63\n\t" + "\n" + "L_fe_pow22523_6_%=: \n\t" + "add r0, sp, #0x40\n\t" + "add r1, sp, #0x40\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_6_%=\n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r2, sp, #32\n\t" + "bl fe_mul\n\t" + "mov r4, #50\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + "add r0, sp, #32\n\t" + "add r1, sp, #32\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_7_%=\n\t" + "mov r0, sp\n\t" + "add r1, sp, #32\n\t" + "mov r2, sp\n\t" + "bl fe_mul\n\t" + "mov r4, #2\n\t" + "\n" + "L_fe_pow22523_8_%=: \n\t" + "mov r0, sp\n\t" + "mov r1, sp\n\t" + "bl fe_sq\n\t" + "sub r4, r4, #1\n\t" + "cmp r4, #0\n\t" + "bne L_fe_pow22523_8_%=\n\t" + "ldr r0, [sp, #96]\n\t" + "mov r1, sp\n\t" + "ldr r2, [sp, #100]\n\t" + "bl fe_mul\n\t" + "ldr %[a], [sp, #100]\n\t" + "ldr %[r], [sp, #96]\n\t" + "add sp, sp, #0x68\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r4" + ); +} + +void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[px], [sp, #12]\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, 
#8]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #32]\n\t" + "ldr r1, [sp, #28]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #36]\n\t" + "ldr r1, [sp, #32]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #28]\n\t" + "ldr r1, [sp, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "lr" + ); +} + +void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) +{ + __asm__ __volatile__ ( + "sub sp, sp, #16\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_sq\n\t" + "ldr r1, [sp, #92]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #88]\n\t" + "ldr r2, [sp, #92]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r2]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r2, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r2, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, 
#0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r2, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #12]\n\t" + "ldr r2, [sp, #4]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r1, [sp, #96]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_sq2\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "ldrd r7, r8, [r1]\n\t" + "ldrd r9, r10, [r1, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ 
+ "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "add sp, sp, #16\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + 
"ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #112]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd 
r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r0, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, 
r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #108]\n\t" + "ldr r2, [sp, #104]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #128]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #124]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #116]\n\t" + "ldr r1, [sp, #120]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + 
"adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #112]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, 
%[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #32\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qxy2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, 
#16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #192]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #196]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #180]\n\t" + "ldr r1, [sp, #188]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + 
"ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #8]\n\t" + "ldr r1, [sp, #12]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r1]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r1, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], 
r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x60\n\t" + "str %[rx], [sp]\n\t" + "str %[ry], [sp, #4]\n\t" + "str %[rz], [sp, #8]\n\t" + "str %[rt], [sp, #12]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Add */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "adds r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "adcs r7, %[rt], r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "adc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r2, [sp, #168]\n\t" + /* Sub */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, [r1, #8]\n\t" + "ldrd r7, r8, [r2]\n\t" + "ldrd r9, r10, [r2, #8]\n\t" + "subs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbcs r10, r6, r10\n\t" + "strd r7, r8, [r0]\n\t" + "strd r9, r10, [r0, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "ldrd r5, r6, [r1, #24]\n\t" + "ldrd r7, r8, [r2, #16]\n\t" + "ldrd r9, r10, [r2, #24]\n\t" + "sbcs r7, %[rt], r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "sbc r10, r6, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r11\n\t" + "adcs r6, r6, r11\n\t" + "adcs r7, r7, r11\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r2, [sp, #196]\n\t" + "ldr r1, [sp]\n\t" + "ldr r0, [sp, #8]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #192]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r0, [sp, #4]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #180]\n\t" + "ldr r1, [sp, #188]\n\t" + "ldr r0, [sp, #12]\n\t" + "bl fe_mul\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp]\n\t" + "bl fe_mul\n\t" + "add r0, sp, #16\n\t" + "ldr r1, [sp]\n\t" + /* Double */ + "ldrd %[rt], r4, [r1]\n\t" + "ldrd r5, r6, 
[r1, #8]\n\t" + "ldrd r7, r8, [r1, #16]\n\t" + "ldrd r9, r10, [r1, #24]\n\t" + "adds %[rt], %[rt], %[rt]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r10, r10\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r11\n\t" + "sbcs r8, r8, r11\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r10, r10, lr\n\t" + "strd %[rt], r4, [r0]\n\t" + "strd r5, r6, [r0, #8]\n\t" + "strd r7, r8, [r0, #16]\n\t" + "strd r9, r10, [r0, #24]\n\t" + "ldr r0, [sp, #4]\n\t" + "ldr r1, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" + "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "ldr r0, [sp, #12]\n\t" + "ldr r1, [sp, #8]\n\t" + "add r2, sp, #16\n\t" + /* Add-Sub */ + /* Add */ + "ldrd %[rt], r4, [r2]\n\t" 
+ "ldrd r5, r6, [r0]\n\t" + "adds r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0]\n\t" + /* Sub */ + "subs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #8]\n\t" + "ldrd r5, r6, [r0, #8]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #8]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #8]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #16]\n\t" + "ldrd r5, r6, [r0, #16]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "mov r12, #0\n\t" + "adcs r8, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "strd r7, r8, [r0, #16]\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "mov lr, #0\n\t" + "sbcs r10, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "strd r9, r10, [r1, #16]\n\t" + /* Add */ + "ldrd %[rt], r4, [r2, #24]\n\t" + "ldrd r5, r6, [r0, #24]\n\t" + "adds r12, r12, #-1\n\t" + "adcs r7, %[rt], r5\n\t" + "adc r8, r4, r6\n\t" + /* Sub */ + "adds lr, lr, #-1\n\t" + "sbcs r9, %[rt], r5\n\t" + "sbc r10, r4, r6\n\t" + "mov r12, #-19\n\t" + "asr r11, r8, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Sub modulus (if overflow) */ + "ldrd %[rt], r4, [r0]\n\t" + "subs %[rt], %[rt], r12\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r4, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #8]\n\t" + "ldrd %[rt], r4, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r11\n\t" + "sbcs r4, r4, r11\n\t" + "strd %[rt], r4, [r0, #16]\n\t" + "sbcs r7, r7, r11\n\t" + "sbc r8, r8, lr\n\t" + "strd r7, r8, [r0, #24]\n\t" + "mov r12, #-19\n\t" + "asr r11, r10, #31\n\t" + /* Mask the modulus */ + "and r12, r11, r12\n\t" + "and lr, r11, #0x7fffffff\n\t" + /* Add modulus (if underflow) */ + "ldrd %[rt], r4, [r1]\n\t" + "adds %[rt], %[rt], r12\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r4, [r1, #8]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #8]\n\t" + "ldrd %[rt], r4, [r1, #16]\n\t" + "adcs %[rt], %[rt], r11\n\t" + "adcs r4, r4, r11\n\t" + "strd %[rt], r4, [r1, #16]\n\t" + "adcs r9, r9, r11\n\t" + "adc r10, r10, lr\n\t" + "strd r9, r10, [r1, #24]\n\t" + "add sp, sp, #0x60\n\t" + : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + : + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)qz; + (void)qt2d; + (void)qyplusx; + (void)qyminusx; +} + +#endif /* !__aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S new file mode 100644 index 000000000..8e79c3e18 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -0,0 +1,5332 @@ +/* armv8-32-sha512-asm + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S + */ +#ifndef __aarch64__ +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_len_k, %object + .size L_SHA512_transform_len_k, 640 + .align 3 +L_SHA512_transform_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + .word 0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + 
.word 0x28db77f5 + .word 0x40c72493 + .word 0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + push {r4, r5, r6, r7, r8, r9, r10, lr} + sub sp, sp, #0xc0 + adr r3, L_SHA512_transform_len_k + # Copy digest to add in at end + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + ldrd r8, r9, [r0, #24] + str r12, [sp, #128] + str lr, [sp, #132] + strd r4, r5, [sp, #136] + strd r6, r7, [sp, #144] + strd r8, r9, [sp, #152] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + ldrd r8, r9, [r0, #56] + str r12, [sp, #160] + str lr, [sp, #164] + strd r4, r5, [sp, #168] + strd r6, r7, [sp, #176] + strd r8, r9, [sp, #184] + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load, Reverse and Store W + ldr r12, [r1] + ldr lr, [r1, #4] + ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r1, #16] + ldrd r8, r9, [r1, #24] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp] + str r12, [sp, #4] + str r5, [sp, #8] + str r4, [sp, #12] + str r7, [sp, #16] + str r6, [sp, #20] + str r9, [sp, #24] + str r8, [sp, #28] + ldr r12, [r1, #32] + ldr lr, [r1, #36] + ldrd r4, r5, [r1, #40] + ldrd r6, r7, [r1, #48] + ldrd r8, r9, [r1, #56] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #32] + str r12, [sp, #36] + str r5, [sp, #40] + str r4, [sp, #44] + str r7, [sp, #48] + str r6, [sp, #52] + str r9, [sp, #56] + str r8, [sp, #60] + ldr r12, [r1, #64] + ldr lr, [r1, #68] + ldrd r4, r5, [r1, #72] + ldrd r6, r7, [r1, #80] + ldrd r8, r9, [r1, #88] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #64] + str r12, [sp, #68] + str r5, [sp, #72] + str r4, [sp, #76] + str r7, [sp, #80] + str r6, [sp, #84] + str r9, [sp, #88] + str r8, [sp, #92] + ldr r12, [r1, #96] + ldr lr, [r1, #100] + ldrd r4, r5, [r1, #104] + ldrd r6, r7, [r1, #112] + ldrd r8, r9, [r1, #120] + rev r12, r12 + rev lr, lr + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + rev r8, r8 + rev r9, r9 + str lr, [sp, #96] + str r12, [sp, #100] + str r5, [sp, #104] + str r4, [sp, #108] + str r7, [sp, #112] + str r6, [sp, #116] + str r9, [sp, #120] + str r8, [sp, #124] + # Pre-calc: b ^ c + ldrd r8, r9, [r0, #8] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r8, r8, r12 + eor r9, r9, lr + mov r10, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd 
r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Calc new W[0] + ldr r12, [sp, #112] + ldr lr, [sp, #116] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp] + ldr lr, [sp, #4] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp] + str lr, [sp, #4] + ldr r12, [sp, #8] + ldr lr, [sp, #12] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp] + ldr lr, [sp, #4] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp] + str lr, [sp, #4] + # Round 1 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #8] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, 
[r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Calc new W[1] + ldr r12, [sp, #120] + ldr lr, [sp, #124] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #8] + ldr lr, [sp, #12] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #8] + str lr, [sp, #12] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #8] + ldr lr, [sp, #12] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #8] + str lr, [sp, #12] + # Round 2 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #16] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Calc new W[2] + ldr r12, [sp] + ldr lr, [sp, #4] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #16] + ldr lr, [sp, #20] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + adds 
r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #16] + str lr, [sp, #20] + ldr r12, [sp, #24] + ldr lr, [sp, #28] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #16] + ldr lr, [sp, #20] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #16] + str lr, [sp, #20] + # Round 3 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #24] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Calc new W[3] + ldr r12, [sp, #8] + ldr lr, [sp, #12] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #24] + str lr, [sp, #28] + ldr r12, [sp, #32] + ldr lr, [sp, #36] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #24] + ldr lr, [sp, #28] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #24] + str lr, [sp, #28] + # Round 4 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl 
#14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #32] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Calc new W[4] + ldr r12, [sp, #16] + ldr lr, [sp, #20] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #32] + ldr lr, [sp, #36] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #32] + str lr, [sp, #36] + ldr r12, [sp, #40] + ldr lr, [sp, #44] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #32] + ldr lr, [sp, #36] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #32] + str lr, [sp, #36] + # Round 5 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #40] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, 
#16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Calc new W[5] + ldr r12, [sp, #24] + ldr lr, [sp, #28] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #40] + ldr lr, [sp, #44] + ldrd r6, r7, [sp, #112] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #40] + str lr, [sp, #44] + ldr r12, [sp, #48] + ldr lr, [sp, #52] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #40] + ldr lr, [sp, #44] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #40] + str lr, [sp, #44] + # Round 6 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #48] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd 
r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Calc new W[6] + ldr r12, [sp, #32] + ldr lr, [sp, #36] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #48] + ldr lr, [sp, #52] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #48] + str lr, [sp, #52] + ldr r12, [sp, #56] + ldr lr, [sp, #60] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #48] + ldr lr, [sp, #52] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #48] + str lr, [sp, #52] + # Round 7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #56] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Calc new W[7] + ldr r12, [sp, #40] + ldr lr, [sp, #44] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #56] + ldr lr, [sp, #60] + ldrd r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #56] + str lr, [sp, #60] + ldr r12, [sp, #64] + ldr lr, [sp, #68] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl 
#31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #56] + ldr lr, [sp, #60] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #56] + str lr, [sp, #60] + # Round 8 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #64] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Calc new W[8] + ldr r12, [sp, #48] + ldr lr, [sp, #52] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #64] + ldr lr, [sp, #68] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #64] + str lr, [sp, #68] + ldr r12, [sp, #72] + ldr lr, [sp, #76] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #64] + ldr lr, [sp, #68] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #64] + str lr, [sp, #68] + # Round 9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] 
+ eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #72] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Calc new W[9] + ldr r12, [sp, #56] + ldr lr, [sp, #60] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #72] + ldr lr, [sp, #76] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #72] + str lr, [sp, #76] + ldr r12, [sp, #80] + ldr lr, [sp, #84] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #72] + ldr lr, [sp, #76] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #72] + str lr, [sp, #76] + # Round 10 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #80] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl 
#4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Calc new W[10] + ldr r12, [sp, #64] + ldr lr, [sp, #68] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #80] + ldr lr, [sp, #84] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #80] + str lr, [sp, #84] + ldr r12, [sp, #88] + ldr lr, [sp, #92] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #80] + ldr lr, [sp, #84] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #80] + str lr, [sp, #84] + # Round 11 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #88] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Calc new W[11] + ldr r12, [sp, #72] + ldr lr, [sp, #76] + lsrs r4, r12, #19 + 
lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #88] + ldr lr, [sp, #92] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #88] + str lr, [sp, #92] + ldr r12, [sp, #96] + ldr lr, [sp, #100] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #88] + ldr lr, [sp, #92] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #88] + str lr, [sp, #92] + # Round 12 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #96] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Calc new W[12] + ldr r12, [sp, #80] + ldr lr, [sp, #84] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #96] + ldr lr, [sp, #100] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #96] + str lr, [sp, #100] + ldr r12, [sp, #104] + ldr lr, [sp, #108] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, 
r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #96] + ldr lr, [sp, #100] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #96] + str lr, [sp, #100] + # Round 13 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #104] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Calc new W[13] + ldr r12, [sp, #88] + ldr lr, [sp, #92] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #104] + ldr lr, [sp, #108] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #104] + str lr, [sp, #108] + ldr r12, [sp, #112] + ldr lr, [sp, #116] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #104] + ldr lr, [sp, #108] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #104] + str lr, [sp, #108] + # Round 14 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + 
ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #112] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #112] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Calc new W[14] + ldr r12, [sp, #96] + ldr lr, [sp, #100] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #112] + ldr lr, [sp, #116] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #112] + str lr, [sp, #116] + ldr r12, [sp, #120] + ldr lr, [sp, #124] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #112] + ldr lr, [sp, #116] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #112] + str lr, [sp, #116] + # Round 15 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #120] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, 
r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Calc new W[15] + ldr r12, [sp, #104] + ldr lr, [sp, #108] + lsrs r4, r12, #19 + lsrs r5, lr, #19 + orr r5, r5, r12, lsl #13 + orr r4, r4, lr, lsl #13 + lsls r6, r12, #3 + lsls r7, lr, #3 + orr r7, r7, r12, lsr #29 + orr r6, r6, lr, lsr #29 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #6 + lsrs r7, lr, #6 + orr r6, r6, lr, lsl #26 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #120] + ldr lr, [sp, #124] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + adds r12, r12, r6 + adc lr, lr, r7 + str r12, [sp, #120] + str lr, [sp, #124] + ldr r12, [sp] + ldr lr, [sp, #4] + lsrs r4, r12, #1 + lsrs r5, lr, #1 + orr r5, r5, r12, lsl #31 + orr r4, r4, lr, lsl #31 + lsrs r6, r12, #8 + lsrs r7, lr, #8 + orr r7, r7, r12, lsl #24 + orr r6, r6, lr, lsl #24 + eor r5, r5, r7 + eor r4, r4, r6 + lsrs r6, r12, #7 + lsrs r7, lr, #7 + orr r6, r6, lr, lsl #25 + eor r5, r5, r7 + eor r4, r4, r6 + ldr r12, [sp, #120] + ldr lr, [sp, #124] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [sp, #120] + str lr, [sp, #124] + add r3, r3, #0x80 + subs r10, r10, #1 + bne L_sha512_len_neon_start + # Round 0 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Round 1 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs 
r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #8] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #8] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Round 2 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #16] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #16] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Round 3 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl 
#18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #24] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #24] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Round 4 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #32] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #32] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Round 5 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + 
orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #40] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #40] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Round 6 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #48] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #48] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Round 7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr 
r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #56] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #56] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Round 8 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r0, #48] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r6, r7, [sp, #64] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #64] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #24] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #56] + str lr, [r0, #60] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0] + ldr lr, [r0, #4] + strd r6, r7, [r0, #24] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0] + ldrd r4, r5, [r0, #8] + str r12, [r0, #56] + str lr, [r0, #60] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #56] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #56] + mov r8, r6 + mov r9, r7 + # Round 9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl 
#18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r0, #40] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r6, r7, [sp, #72] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #72] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #16] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #48] + str lr, [r0, #52] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #56] + ldr lr, [r0, #60] + strd r6, r7, [r0, #16] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #56] + ldrd r4, r5, [r0] + str r12, [r0, #48] + str lr, [r0, #52] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #48] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #48] + mov r8, r6 + mov r9, r7 + # Round 10 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r0, #32] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r6, r7, [sp, #80] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #80] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #8] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #40] + str lr, [r0, #44] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #48] + ldr lr, [r0, #52] + strd r6, r7, [r0, #8] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #40] + ldr lr, [r0, #44] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #48] + ldrd r4, r5, [r0, #56] + str r12, [r0, #40] + str lr, [r0, #44] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #40] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #40] + mov r8, r6 + mov r9, r7 + # Round 11 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + lsrs r4, r12, #14 + lsrs 
r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r4, r5, [r0, #16] + ldrd r6, r7, [r0, #24] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r6, r7, [sp, #88] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #88] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #32] + str lr, [r0, #36] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #40] + ldr lr, [r0, #44] + strd r6, r7, [r0] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #32] + ldr lr, [r0, #36] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #40] + ldrd r4, r5, [r0, #48] + str r12, [r0, #32] + str lr, [r0, #36] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #32] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #32] + mov r8, r6 + mov r9, r7 + # Round 12 + ldr r12, [r0] + ldr lr, [r0, #4] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [r0, #16] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + ldrd r6, r7, [sp, #96] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #96] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #56] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #24] + str lr, [r0, #28] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #32] + ldr lr, [r0, #36] + strd r6, r7, [r0, #56] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #24] + ldr lr, [r0, #28] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #32] + ldrd r4, r5, [r0, #40] + str r12, [r0, #24] + str lr, [r0, #28] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #24] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #24] + mov r8, r6 + mov r9, r7 + # Round 13 + ldr r12, [r0, #56] + ldr lr, [r0, #60] + lsrs 
r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + ldr r12, [r0, #56] + ldr lr, [r0, #60] + ldrd r4, r5, [r0] + ldrd r6, r7, [r0, #8] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r6, r7, [sp, #104] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #104] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #48] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #16] + str lr, [r0, #20] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #24] + ldr lr, [r0, #28] + strd r6, r7, [r0, #48] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #16] + ldr lr, [r0, #20] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #24] + ldrd r4, r5, [r0, #32] + str r12, [r0, #16] + str lr, [r0, #20] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #16] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #16] + mov r8, r6 + mov r9, r7 + # Round 14 + ldr r12, [r0, #48] + ldr lr, [r0, #52] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r0] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + ldrd r6, r7, [sp, #112] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #112] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #40] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0, #8] + str lr, [r0, #12] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #16] + ldr lr, [r0, #20] + strd r6, r7, [r0, #40] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0, #8] + ldr lr, [r0, #12] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #16] + ldrd r4, r5, [r0, #24] + str r12, [r0, #8] + str lr, [r0, #12] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0, #8] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0, #8] + mov r8, r6 + mov r9, r7 + # Round 15 + ldr r12, [r0, 
#40] + ldr lr, [r0, #44] + lsrs r4, r12, #14 + lsrs r5, lr, #14 + orr r5, r5, r12, lsl #18 + orr r4, r4, lr, lsl #18 + lsrs r6, r12, #18 + lsrs r7, lr, #18 + orr r7, r7, r12, lsl #14 + orr r6, r6, lr, lsl #14 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #23 + lsls r7, lr, #23 + orr r7, r7, r12, lsr #9 + orr r6, r6, lr, lsr #9 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + ldr r12, [r0, #40] + ldr lr, [r0, #44] + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r0, #56] + eor r4, r4, r6 + eor r5, r5, r7 + and r4, r4, r12 + and r5, r5, lr + eor r4, r4, r6 + eor r5, r5, r7 + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r6, r7, [sp, #120] + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r4, r5, [r3, #120] + adds r12, r12, r6 + adc lr, lr, r7 + ldrd r6, r7, [r0, #32] + adds r12, r12, r4 + adc lr, lr, r5 + str r12, [r0] + str lr, [r0, #4] + adds r6, r6, r12 + adc r7, r7, lr + ldr r12, [r0, #8] + ldr lr, [r0, #12] + strd r6, r7, [r0, #32] + lsrs r4, r12, #28 + lsrs r5, lr, #28 + orr r5, r5, r12, lsl #4 + orr r4, r4, lr, lsl #4 + lsls r6, r12, #30 + lsls r7, lr, #30 + orr r7, r7, r12, lsr #2 + orr r6, r6, lr, lsr #2 + eor r4, r4, r6 + eor r5, r5, r7 + lsls r6, r12, #25 + lsls r7, lr, #25 + orr r7, r7, r12, lsr #7 + orr r6, r6, lr, lsr #7 + ldr r12, [r0] + ldr lr, [r0, #4] + eor r4, r4, r6 + eor r5, r5, r7 + adds r12, r12, r4 + adc lr, lr, r5 + ldrd r6, r7, [r0, #8] + ldrd r4, r5, [r0, #16] + str r12, [r0] + str lr, [r0, #4] + eor r6, r6, r4 + eor r7, r7, r5 + and r8, r8, r6 + and r9, r9, r7 + eor r8, r8, r4 + eor r9, r9, r5 + ldrd r4, r5, [r0] + adds r4, r4, r8 + adc r5, r5, r9 + strd r4, r5, [r0] + mov r8, r6 + mov r9, r7 + # Add in digest from start + ldr r12, [r0] + ldr lr, [r0, #4] + ldrd r4, r5, [r0, #8] + ldrd r6, r7, [sp, #128] + ldrd r8, r9, [sp, #136] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0] + str lr, [r0, #4] + strd r4, r5, [r0, #8] + str r12, [sp, #128] + str lr, [sp, #132] + strd r4, r5, [sp, #136] + ldr r12, [r0, #16] + ldr lr, [r0, #20] + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [sp, #144] + ldrd r8, r9, [sp, #152] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #16] + str lr, [r0, #20] + strd r4, r5, [r0, #24] + str r12, [sp, #144] + str lr, [sp, #148] + strd r4, r5, [sp, #152] + ldr r12, [r0, #32] + ldr lr, [r0, #36] + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [sp, #160] + ldrd r8, r9, [sp, #168] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #32] + str lr, [r0, #36] + strd r4, r5, [r0, #40] + str r12, [sp, #160] + str lr, [sp, #164] + strd r4, r5, [sp, #168] + ldr r12, [r0, #48] + ldr lr, [r0, #52] + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [sp, #176] + ldrd r8, r9, [sp, #184] + adds r12, r12, r6 + adc lr, lr, r7 + adds r4, r4, r8 + adc r5, r5, r9 + str r12, [r0, #48] + str lr, [r0, #52] + strd r4, r5, [r0, #56] + str r12, [sp, #176] + str lr, [sp, #180] + strd r4, r5, [sp, #184] + subs r2, r2, #0x80 + sub r3, r3, #0x200 + add r1, r1, #0x80 + bne L_sha512_len_neon_begin + eor r0, r0, r0 + add sp, sp, #0xc0 + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#ifndef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_neon_len_k, %object + .size L_SHA512_transform_neon_len_k, 640 + .align 3 +L_SHA512_transform_neon_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 
0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + .word 0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + .word 0x28db77f5 + .word 0x40c72493 + .word 0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 2 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + vpush {d8-d15} + # Load digest into working vars + vldm.64 r0, {d0-d7} + # Start of loop processing a block +L_sha512_len_neon_begin: + # Load W + vldm.64 r1!, {d16-d31} + vrev64.8 q8, q8 + vrev64.8 q9, q9 + vrev64.8 q10, q10 + vrev64.8 q11, q11 + vrev64.8 q12, q12 + vrev64.8 q13, q13 + vrev64.8 q14, q14 + vrev64.8 q15, q15 + adr r3, L_SHA512_transform_neon_len_k + mov r12, #4 + # Start of 16 rounds +L_sha512_len_neon_start: + # Round 0 + vld1.64 {d12}, [r3:64]! 
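The NEON Transform_Sha512_Len above keeps the eight working words a..h in d0-d7, runs four passes of sixteen rounds (the r12 counter and the L_sha512_len_neon_start loop) that also extend the message schedule, then sixteen final rounds before adding the saved digest back in. Each "# Round N" block is one FIPS 180-4 SHA-512 round: the vshl/vsri pairs are 64-bit rotates (shift-left 50 plus shift-right-insert 14 is ROTR by 14) and the vbsl instructions compute Ch and Maj. A minimal plain-C sketch of what one round computes, with illustrative helper names of my own rather than anything from wolfSSL:

    #include <stdint.h>

    /* Illustrative sketch only: what each "# Round N" block above computes. */
    #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
    #define CH(x, y, z)  (((x) & (y)) ^ (~(x) & (z)))
    #define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    #define SIGMA0(x)    (ROTR64((x), 28) ^ ROTR64((x), 34) ^ ROTR64((x), 39))
    #define SIGMA1(x)    (ROTR64((x), 14) ^ ROTR64((x), 18) ^ ROTR64((x), 41))

    /* s[0..7] = a..h.  The assembly never does the h=g, g=f, ... shuffle;
     * instead the roles rotate through d0-d7 from one round to the next. */
    void sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
    {
        uint64_t t1 = s[7] + SIGMA1(s[4]) + CH(s[4], s[5], s[6]) + k + w;
        uint64_t t2 = SIGMA0(s[0]) + MAJ(s[0], s[1], s[2]);
        s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
        s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
    }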
+ vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d16 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 1 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d17 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Calc new W[0]-W[1] + vext.8 q6, q8, q9, #8 + vshl.u64 q4, q15, #45 + vsri.u64 q4, q15, #19 + vshl.u64 q5, q15, #3 + vsri.u64 q5, q15, #61 + veor q5, q4 + vshr.u64 q4, q15, #6 + veor q5, q4 + vadd.i64 q8, q5 + vext.8 q7, q12, q13, #8 + vadd.i64 q8, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q8, q5 + # Round 2 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d18 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 3 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d19 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Calc new W[2]-W[3] + vext.8 q6, q9, q10, #8 + vshl.u64 q4, q8, #45 + vsri.u64 q4, q8, #19 + vshl.u64 q5, q8, #3 + vsri.u64 q5, q8, #61 + veor q5, q4 + vshr.u64 q4, q8, #6 + veor q5, q4 + vadd.i64 q9, q5 + vext.8 q7, q13, q14, #8 + vadd.i64 q9, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q9, q5 + # Round 4 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d20 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 5 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d21 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Calc new W[4]-W[5] + vext.8 q6, q10, q11, #8 + vshl.u64 q4, q9, #45 + vsri.u64 q4, q9, #19 + vshl.u64 q5, q9, #3 + vsri.u64 q5, q9, #61 + veor q5, q4 + vshr.u64 q4, q9, #6 + veor q5, q4 + vadd.i64 q10, q5 + vext.8 q7, q14, q15, #8 + vadd.i64 q10, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q10, q5 + # Round 6 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d22 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 7 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d23 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Calc new W[6]-W[7] + vext.8 q6, q11, q12, #8 + vshl.u64 q4, q10, #45 + vsri.u64 q4, q10, #19 + vshl.u64 q5, q10, #3 + vsri.u64 q5, q10, #61 + veor q5, q4 + vshr.u64 q4, q10, #6 + veor q5, q4 + vadd.i64 q11, q5 + vext.8 q7, q15, q8, #8 + vadd.i64 q11, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q11, q5 + # Round 8 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d24 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 9 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d25 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Calc new W[8]-W[9] + vext.8 q6, q12, q13, #8 + vshl.u64 q4, q11, #45 + vsri.u64 q4, q11, #19 + vshl.u64 q5, q11, #3 + vsri.u64 q5, q11, #61 + veor q5, q4 + vshr.u64 q4, q11, #6 + veor q5, q4 + vadd.i64 q12, q5 + vext.8 q7, q8, q9, #8 + vadd.i64 q12, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q12, q5 + # Round 10 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d26 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 11 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d27 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Calc new W[10]-W[11] + vext.8 q6, q13, q14, #8 + vshl.u64 q4, q12, #45 + vsri.u64 q4, q12, #19 + vshl.u64 q5, q12, #3 + vsri.u64 q5, q12, #61 + veor q5, q4 + vshr.u64 q4, q12, #6 + veor q5, q4 + vadd.i64 q13, q5 + vext.8 q7, q9, q10, #8 + vadd.i64 q13, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q13, q5 + # Round 12 + vld1.64 {d12}, [r3:64]! 
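The "# Calc new W[i]-W[i+1]" blocks interleaved with the rounds above extend the message schedule two 64-bit words at a time: q8-q15 hold a rolling window of sixteen W values, vext pulls the W[t-7] and W[t-15] operands across register pairs, and the vshl/vsri/vshr groups are the small sigma functions. A self-contained plain-C sketch of one full sixteen-word expansion step (helper names are illustrative, not wolfSSL's):

    #include <stdint.h>

    /* Illustrative sketch only: the rolling schedule the "# Calc new W[..]" blocks
     * maintain, W[t] = ssig1(W[t-2]) + W[t-7] + ssig0(W[t-15]) + W[t-16]. */
    #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
    #define SSIG0(x)     (ROTR64((x), 1)  ^ ROTR64((x), 8)  ^ ((x) >> 7))
    #define SSIG1(x)     (ROTR64((x), 19) ^ ROTR64((x), 61) ^ ((x) >> 6))

    void sha512_expand(uint64_t w[16])
    {
        int i;
        for (i = 0; i < 16; i++)
            w[i] += SSIG1(w[(i + 14) & 15]) + w[(i + 9) & 15] + SSIG0(w[(i + 1) & 15]);
    }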
+ vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d28 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 13 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d29 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Calc new W[12]-W[13] + vext.8 q6, q14, q15, #8 + vshl.u64 q4, q13, #45 + vsri.u64 q4, q13, #19 + vshl.u64 q5, q13, #3 + vsri.u64 q5, q13, #61 + veor q5, q4 + vshr.u64 q4, q13, #6 + veor q5, q4 + vadd.i64 q14, q5 + vext.8 q7, q10, q11, #8 + vadd.i64 q14, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q14, q5 + # Round 14 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d30 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 15 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d31 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Calc new W[14]-W[15] + vext.8 q6, q15, q8, #8 + vshl.u64 q4, q14, #45 + vsri.u64 q4, q14, #19 + vshl.u64 q5, q14, #3 + vsri.u64 q5, q14, #61 + veor q5, q4 + vshr.u64 q4, q14, #6 + veor q5, q4 + vadd.i64 q15, q5 + vext.8 q7, q11, q12, #8 + vadd.i64 q15, q7 + vshl.u64 q4, q6, #63 + vsri.u64 q4, q6, #1 + vshl.u64 q5, q6, #56 + vsri.u64 q5, q6, #8 + veor q5, q4 + vshr.u64 q6, #7 + veor q5, q6 + vadd.i64 q15, q5 + subs r12, r12, #1 + bne L_sha512_len_neon_start + # Round 0 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d16 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 1 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d17 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Round 2 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d18 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 3 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d19 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Round 4 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d20 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 5 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d21 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Round 6 + vld1.64 {d12}, [r3:64]! 
+ vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d22 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 7 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d23 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Round 8 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d4, #50 + vsri.u64 d8, d4, #14 + vshl.u64 d9, d0, #36 + vsri.u64 d9, d0, #28 + vshl.u64 d10, d4, #46 + vsri.u64 d10, d4, #18 + vshl.u64 d11, d0, #30 + vsri.u64 d11, d0, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d4, #23 + vsri.u64 d10, d4, #41 + vshl.u64 d11, d0, #25 + vsri.u64 d11, d0, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d7, d8 + vadd.i64 d12, d24 + vmov d8, d4 + veor d10, d1, d2 + vadd.i64 d7, d12 + vbsl d8, d5, d6 + vbsl d10, d0, d2 + vadd.i64 d7, d8 + vadd.i64 d10, d9 + vadd.i64 d3, d7 + vadd.i64 d7, d10 + # Round 9 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d3, #50 + vsri.u64 d8, d3, #14 + vshl.u64 d9, d7, #36 + vsri.u64 d9, d7, #28 + vshl.u64 d10, d3, #46 + vsri.u64 d10, d3, #18 + vshl.u64 d11, d7, #30 + vsri.u64 d11, d7, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d3, #23 + vsri.u64 d10, d3, #41 + vshl.u64 d11, d7, #25 + vsri.u64 d11, d7, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d6, d8 + vadd.i64 d12, d25 + vmov d8, d3 + veor d10, d0, d1 + vadd.i64 d6, d12 + vbsl d8, d4, d5 + vbsl d10, d7, d1 + vadd.i64 d6, d8 + vadd.i64 d10, d9 + vadd.i64 d2, d6 + vadd.i64 d6, d10 + # Round 10 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d2, #50 + vsri.u64 d8, d2, #14 + vshl.u64 d9, d6, #36 + vsri.u64 d9, d6, #28 + vshl.u64 d10, d2, #46 + vsri.u64 d10, d2, #18 + vshl.u64 d11, d6, #30 + vsri.u64 d11, d6, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d2, #23 + vsri.u64 d10, d2, #41 + vshl.u64 d11, d6, #25 + vsri.u64 d11, d6, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d5, d8 + vadd.i64 d12, d26 + vmov d8, d2 + veor d10, d7, d0 + vadd.i64 d5, d12 + vbsl d8, d3, d4 + vbsl d10, d6, d0 + vadd.i64 d5, d8 + vadd.i64 d10, d9 + vadd.i64 d1, d5 + vadd.i64 d5, d10 + # Round 11 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d1, #50 + vsri.u64 d8, d1, #14 + vshl.u64 d9, d5, #36 + vsri.u64 d9, d5, #28 + vshl.u64 d10, d1, #46 + vsri.u64 d10, d1, #18 + vshl.u64 d11, d5, #30 + vsri.u64 d11, d5, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d1, #23 + vsri.u64 d10, d1, #41 + vshl.u64 d11, d5, #25 + vsri.u64 d11, d5, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d4, d8 + vadd.i64 d12, d27 + vmov d8, d1 + veor d10, d6, d7 + vadd.i64 d4, d12 + vbsl d8, d2, d3 + vbsl d10, d5, d7 + vadd.i64 d4, d8 + vadd.i64 d10, d9 + vadd.i64 d0, d4 + vadd.i64 d4, d10 + # Round 12 + vld1.64 {d12}, [r3:64]! 
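The rounds from here to the end of the routine run after the four-pass loop has closed (the subs r12, r12, #1 / bne L_sha512_len_neon_start above), so they consume the last expanded schedule without producing a new one; the tail then reloads the saved digest with vldm, adds it into the working state with four vadd.i64, stores it back, and loops while r2 still holds at least one 128-byte block. A hedged C outline of that overall shape, reusing the sha512_round()/sha512_expand() sketches above and assuming a K[80] array holding the same constants as L_SHA512_transform_neon_len_k:

    #include <stdint.h>
    #include <string.h>

    extern const uint64_t K[80];               /* assumed: the 80 round constants */
    void sha512_round(uint64_t s[8], uint64_t k, uint64_t w);  /* sketched earlier */
    void sha512_expand(uint64_t w[16]);                        /* sketched earlier */

    /* Illustrative outline only, not the wolfSSL entry point. */
    void transform_sha512_len_sketch(uint64_t digest[8],
                                     const uint8_t* data, uint32_t len)
    {
        while (len >= 128) {
            uint64_t w[16], s[8];
            int i, j, pass;
            for (i = 0; i < 16; i++) {          /* big-endian loads: the vrev64.8 step */
                w[i] = 0;
                for (j = 0; j < 8; j++)
                    w[i] = (w[i] << 8) | data[8 * i + j];
            }
            memcpy(s, digest, sizeof(s));
            for (pass = 0; pass < 4; pass++) {  /* the r12 = 4 loop above */
                for (i = 0; i < 16; i++)
                    sha512_round(s, K[16 * pass + i], w[i]);
                sha512_expand(w);
            }
            for (i = 0; i < 16; i++)            /* the 16 rounds after the loop */
                sha512_round(s, K[64 + i], w[i]);
            for (i = 0; i < 8; i++)             /* "Add in digest from start" */
                digest[i] += s[i];
            data += 128;
            len  -= 128;
        }
    }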
+ vshl.u64 d8, d0, #50 + vsri.u64 d8, d0, #14 + vshl.u64 d9, d4, #36 + vsri.u64 d9, d4, #28 + vshl.u64 d10, d0, #46 + vsri.u64 d10, d0, #18 + vshl.u64 d11, d4, #30 + vsri.u64 d11, d4, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d0, #23 + vsri.u64 d10, d0, #41 + vshl.u64 d11, d4, #25 + vsri.u64 d11, d4, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d3, d8 + vadd.i64 d12, d28 + vmov d8, d0 + veor d10, d5, d6 + vadd.i64 d3, d12 + vbsl d8, d1, d2 + vbsl d10, d4, d6 + vadd.i64 d3, d8 + vadd.i64 d10, d9 + vadd.i64 d7, d3 + vadd.i64 d3, d10 + # Round 13 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d7, #50 + vsri.u64 d8, d7, #14 + vshl.u64 d9, d3, #36 + vsri.u64 d9, d3, #28 + vshl.u64 d10, d7, #46 + vsri.u64 d10, d7, #18 + vshl.u64 d11, d3, #30 + vsri.u64 d11, d3, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d7, #23 + vsri.u64 d10, d7, #41 + vshl.u64 d11, d3, #25 + vsri.u64 d11, d3, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d2, d8 + vadd.i64 d12, d29 + vmov d8, d7 + veor d10, d4, d5 + vadd.i64 d2, d12 + vbsl d8, d0, d1 + vbsl d10, d3, d5 + vadd.i64 d2, d8 + vadd.i64 d10, d9 + vadd.i64 d6, d2 + vadd.i64 d2, d10 + # Round 14 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d6, #50 + vsri.u64 d8, d6, #14 + vshl.u64 d9, d2, #36 + vsri.u64 d9, d2, #28 + vshl.u64 d10, d6, #46 + vsri.u64 d10, d6, #18 + vshl.u64 d11, d2, #30 + vsri.u64 d11, d2, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d6, #23 + vsri.u64 d10, d6, #41 + vshl.u64 d11, d2, #25 + vsri.u64 d11, d2, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d1, d8 + vadd.i64 d12, d30 + vmov d8, d6 + veor d10, d3, d4 + vadd.i64 d1, d12 + vbsl d8, d7, d0 + vbsl d10, d2, d4 + vadd.i64 d1, d8 + vadd.i64 d10, d9 + vadd.i64 d5, d1 + vadd.i64 d1, d10 + # Round 15 + vld1.64 {d12}, [r3:64]! + vshl.u64 d8, d5, #50 + vsri.u64 d8, d5, #14 + vshl.u64 d9, d1, #36 + vsri.u64 d9, d1, #28 + vshl.u64 d10, d5, #46 + vsri.u64 d10, d5, #18 + vshl.u64 d11, d1, #30 + vsri.u64 d11, d1, #34 + veor d8, d10 + veor d9, d11 + vshl.u64 d10, d5, #23 + vsri.u64 d10, d5, #41 + vshl.u64 d11, d1, #25 + vsri.u64 d11, d1, #39 + veor d8, d10 + veor d9, d11 + vadd.i64 d0, d8 + vadd.i64 d12, d31 + vmov d8, d5 + veor d10, d2, d3 + vadd.i64 d0, d12 + vbsl d8, d6, d7 + vbsl d10, d1, d3 + vadd.i64 d0, d8 + vadd.i64 d10, d9 + vadd.i64 d4, d0 + vadd.i64 d0, d10 + # Add in digest from start + vldm.64 r0, {d8-d15} + vadd.i64 q0, q0, q4 + vadd.i64 q1, q1, q5 + vadd.i64 q2, q2, q6 + vadd.i64 q3, q3, q7 + vstm.64 r0, {d0-d7} + subs r2, r2, #0x80 + bne L_sha512_len_neon_begin + vpop {d8-d15} + bx lr + .size Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#endif /* !__aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c new file mode 100644 index 000000000..889deda5c --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c @@ -0,0 +1,4773 @@ +/* armv8-32-sha512-asm + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-sha512-asm.c + */ +#ifndef __aarch64__ +#include +#include + +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_SHA512_transform_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "sub sp, sp, #0xc0\n\t" + "mov r3, %[L_SHA512_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r8, r9, [%[sha512], #24]\n\t" + "strd r12, lr, [sp, #128]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "strd r6, r7, [sp, #144]\n\t" + "strd r8, r9, [sp, #152]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r8, r9, [%[sha512], #56]\n\t" + "strd r12, lr, [sp, #160]\n\t" + "strd r4, r5, [sp, #168]\n\t" + "strd r6, r7, [sp, #176]\n\t" + "strd r8, r9, [sp, #184]\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load, Reverse and Store W */ + "ldrd r12, lr, [%[data]]\n\t" + "ldrd r4, r5, [%[data], #8]\n\t" + "ldrd r6, r7, [%[data], #16]\n\t" + "ldrd r8, r9, [%[data], #24]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev 
r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp]\n\t" + "str r12, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + "str r4, [sp, #12]\n\t" + "str r7, [sp, #16]\n\t" + "str r6, [sp, #20]\n\t" + "str r9, [sp, #24]\n\t" + "str r8, [sp, #28]\n\t" + "ldrd r12, lr, [%[data], #32]\n\t" + "ldrd r4, r5, [%[data], #40]\n\t" + "ldrd r6, r7, [%[data], #48]\n\t" + "ldrd r8, r9, [%[data], #56]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #32]\n\t" + "str r12, [sp, #36]\n\t" + "str r5, [sp, #40]\n\t" + "str r4, [sp, #44]\n\t" + "str r7, [sp, #48]\n\t" + "str r6, [sp, #52]\n\t" + "str r9, [sp, #56]\n\t" + "str r8, [sp, #60]\n\t" + "ldrd r12, lr, [%[data], #64]\n\t" + "ldrd r4, r5, [%[data], #72]\n\t" + "ldrd r6, r7, [%[data], #80]\n\t" + "ldrd r8, r9, [%[data], #88]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #64]\n\t" + "str r12, [sp, #68]\n\t" + "str r5, [sp, #72]\n\t" + "str r4, [sp, #76]\n\t" + "str r7, [sp, #80]\n\t" + "str r6, [sp, #84]\n\t" + "str r9, [sp, #88]\n\t" + "str r8, [sp, #92]\n\t" + "ldrd r12, lr, [%[data], #96]\n\t" + "ldrd r4, r5, [%[data], #104]\n\t" + "ldrd r6, r7, [%[data], #112]\n\t" + "ldrd r8, r9, [%[data], #120]\n\t" + "rev r12, r12\n\t" + "rev lr, lr\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "str lr, [sp, #96]\n\t" + "str r12, [sp, #100]\n\t" + "str r5, [sp, #104]\n\t" + "str r4, [sp, #108]\n\t" + "str r7, [sp, #112]\n\t" + "str r6, [sp, #116]\n\t" + "str r9, [sp, #120]\n\t" + "str r8, [sp, #124]\n\t" + /* Pre-calc: b ^ c */ + "ldrd r8, r9, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r8, r8, r12\n\t" + "eor r9, r9, lr\n\t" + "mov r10, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + 
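The no-NEON version in this C file does the same eighty rounds with each 64-bit word split across a pair of 32-bit registers (r12:lr, r4:r5, ...), so every Sigma and sigma rotation appears as an lsrs/lsls pair plus two orr instructions that carry bits across the halves; rotations of 32 or more simply swap which half feeds which (the lsls #23 / lsr #9 group is the ROTR-by-41 term). A small illustrative C sketch of the pairwise rotate for shifts under 32, such as the ROTR-by-14 at the top of each round (the helper name is mine, not wolfSSL's):

    #include <stdint.h>

    /* Illustrative sketch only: rotate a 64-bit value held as hi:lo 32-bit
     * halves right by n, for 0 < n < 32. */
    void rotr64_pair(uint32_t* hi, uint32_t* lo, unsigned n)
    {
        uint32_t l = *lo, h = *hi;
        *lo = (l >> n) | (h << (32 - n));  /* lsrs r4, r12, #n ; orr r4, r4, lr, lsl #(32-n) */
        *hi = (h >> n) | (l << (32 - n));  /* lsrs r5, lr, #n ; orr r5, r5, r12, lsl #(32-n) */
    }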
"eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[0] */ + "ldrd r12, lr, [sp, #112]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp]\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp]\n\t" + /* Round 1 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #8]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 
7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[1] */ + "ldrd r12, lr, [sp, #120]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #8]\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #8]\n\t" + /* Round 2 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #16]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, 
lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[2] */ + "ldrd r12, lr, [sp]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #16]\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #16]\n\t" + /* Round 3 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #24]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, 
r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[3] */ + "ldrd r12, lr, [sp, #8]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #24]\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #24]\n\t" + /* Round 4 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #32]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + 
"ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[4] */ + "ldrd r12, lr, [sp, #16]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #32]\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #32]\n\t" + /* Round 5 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #40]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + 
"mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[5] */ + "ldrd r12, lr, [sp, #24]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #40]\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #40]\n\t" + /* Round 6 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #48]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[6] */ + "ldrd r12, lr, [sp, #32]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, 
#19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #48]\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #48]\n\t" + /* Round 7 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #56]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[7] */ + "ldrd r12, lr, [sp, #40]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr 
r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #56]\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #56]\n\t" + /* Round 8 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #64]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[8] */ + "ldrd r12, lr, [sp, #48]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, 
lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #64]\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #64]\n\t" + /* Round 9 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #72]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[9] */ + "ldrd r12, lr, [sp, #56]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + 
"adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #72]\n\t" + "ldrd r12, lr, [sp, #80]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #72]\n\t" + /* Round 10 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #80]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[10] */ + "ldrd r12, lr, [sp, #64]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #80]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, 
#80]\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #80]\n\t" + /* Round 11 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #88]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[11] */ + "ldrd r12, lr, [sp, #72]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #88]\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr 
r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #88]\n\t" + /* Round 12 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #96]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[12] */ + "ldrd r12, lr, [sp, #80]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #96]\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 
24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #96]\n\t" + /* Round 13 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #104]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[13] */ + "ldrd r12, lr, [sp, #88]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #104]\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, 
lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #104]\n\t" + /* Round 14 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #112]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[14] */ + "ldrd r12, lr, [sp, #96]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #112]\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + 
"adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #112]\n\t" + /* Round 15 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #120]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Calc new W[15] */ + "ldrd r12, lr, [sp, #104]\n\t" + "lsrs r4, r12, #19\n\t" + "lsrs r5, lr, #19\n\t" + "orr r5, r5, r12, lsl 13\n\t" + "orr r4, r4, lr, lsl 13\n\t" + "lsls r6, r12, #3\n\t" + "lsls r7, lr, #3\n\t" + "orr r7, r7, r12, lsr 29\n\t" + "orr r6, r6, lr, lsr 29\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #6\n\t" + "lsrs r7, lr, #6\n\t" + "orr r6, r6, lr, lsl 26\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "strd r12, lr, [sp, #120]\n\t" + "ldrd r12, lr, [sp]\n\t" + "lsrs r4, r12, #1\n\t" + "lsrs r5, lr, #1\n\t" + "orr r5, r5, r12, lsl 31\n\t" + "orr r4, r4, lr, lsl 31\n\t" + "lsrs r6, r12, #8\n\t" + "lsrs r7, lr, #8\n\t" + "orr r7, r7, r12, lsl 24\n\t" + "orr r6, r6, lr, lsl 24\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "lsrs r6, r12, #7\n\t" + "lsrs r7, lr, #7\n\t" + "orr r6, r6, lr, lsl 25\n\t" + "eor r5, r5, r7\n\t" + "eor r4, r4, r6\n\t" + "ldrd r12, lr, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [sp, #120]\n\t" + "add r3, r3, #0x80\n\t" + "subs r10, r10, #1\n\t" + "bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 
*/ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 1 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #8]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, 
#28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 2 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #16]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 3 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, 
lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #24]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 4 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #32]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], 
#24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 5 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #40]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 6 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + 
"and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #48]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 7 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #56]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + 
"adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 8 */ + "ldrd r12, lr, [%[sha512], #32]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #64]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "strd r6, r7, [%[sha512], #24]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [%[sha512], #56]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 9 */ + "ldrd r12, lr, [%[sha512], #24]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r6, r7, [sp, #72]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #72]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "adds r6, r6, r12\n\t" + "adc 
r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "strd r6, r7, [%[sha512], #16]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #48]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 10 */ + "ldrd r12, lr, [%[sha512], #16]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #80]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "strd r6, r7, [%[sha512], #8]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [%[sha512], #40]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 11 */ + "ldrd r12, lr, [%[sha512], #8]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor 
r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #88]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "strd r6, r7, [%[sha512]]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #32]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 12 */ + "ldrd r12, lr, [%[sha512]]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #96]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #96]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "strd r6, r7, [%[sha512], #56]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + 
"lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [%[sha512], #24]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 13 */ + "ldrd r12, lr, [%[sha512], #56]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "ldrd r12, lr, [%[sha512], #56]\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r6, r7, [sp, #104]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #104]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #48]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #24]\n\t" + "strd r6, r7, [%[sha512], #48]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #24]\n\t" + "ldrd r4, r5, [%[sha512], #32]\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #16]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 14 */ + "ldrd r12, lr, [%[sha512], #48]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, 
[%[sha512], #56]\n\t" + "ldrd r6, r7, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #112]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #112]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #40]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "strd r6, r7, [%[sha512], #40]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [%[sha512], #8]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Round 15 */ + "ldrd r12, lr, [%[sha512], #40]\n\t" + "lsrs r4, r12, #14\n\t" + "lsrs r5, lr, #14\n\t" + "orr r5, r5, r12, lsl 18\n\t" + "orr r4, r4, lr, lsl 18\n\t" + "lsrs r6, r12, #18\n\t" + "lsrs r7, lr, #18\n\t" + "orr r7, r7, r12, lsl 14\n\t" + "orr r6, r6, lr, lsl 14\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #23\n\t" + "lsls r7, lr, #23\n\t" + "orr r7, r7, r12, lsr 9\n\t" + "orr r6, r6, lr, lsr 9\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "ldrd r12, lr, [%[sha512], #40]\n\t" + "ldrd r4, r5, [%[sha512], #48]\n\t" + "ldrd r6, r7, [%[sha512], #56]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "and r4, r4, r12\n\t" + "and r5, r5, lr\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r6, r7, [sp, #120]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r4, r5, [r3, #120]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "ldrd r6, r7, [%[sha512], #32]\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + "ldrd r12, lr, [%[sha512], #8]\n\t" + "strd r6, r7, [%[sha512], #32]\n\t" + "lsrs r4, r12, #28\n\t" + "lsrs r5, lr, #28\n\t" + "orr r5, r5, r12, lsl 4\n\t" + "orr r4, r4, lr, lsl 4\n\t" + "lsls r6, r12, #30\n\t" + "lsls r7, lr, #30\n\t" + "orr r7, r7, r12, lsr 2\n\t" + "orr r6, r6, lr, lsr 2\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "lsls r6, r12, #25\n\t" + "lsls r7, lr, #25\n\t" + "orr r7, r7, r12, lsr 7\n\t" + "orr r6, r6, lr, lsr 7\n\t" + "ldrd r12, lr, [%[sha512]]\n\t" + "eor r4, r4, r6\n\t" + "eor r5, r5, r7\n\t" + "adds r12, r12, r4\n\t" + "adc lr, lr, r5\n\t" + "ldrd r6, r7, [%[sha512], #8]\n\t" + "ldrd r4, r5, [%[sha512], #16]\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "eor r6, r6, r4\n\t" + "eor r7, r7, r5\n\t" + "and r8, r8, 
r6\n\t" + "and r9, r9, r7\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "ldrd r4, r5, [%[sha512]]\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r4, r5, [%[sha512]]\n\t" + "mov r8, r6\n\t" + "mov r9, r7\n\t" + /* Add in digest from start */ + "ldrd r12, lr, [%[sha512]]\n\t" + "ldrd r4, r5, [%[sha512], #8]\n\t" + "ldrd r6, r7, [sp, #128]\n\t" + "ldrd r8, r9, [sp, #136]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512]]\n\t" + "strd r4, r5, [%[sha512], #8]\n\t" + "strd r12, lr, [sp, #128]\n\t" + "strd r4, r5, [sp, #136]\n\t" + "ldrd r12, lr, [%[sha512], #16]\n\t" + "ldrd r4, r5, [%[sha512], #24]\n\t" + "ldrd r6, r7, [sp, #144]\n\t" + "ldrd r8, r9, [sp, #152]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #16]\n\t" + "strd r4, r5, [%[sha512], #24]\n\t" + "strd r12, lr, [sp, #144]\n\t" + "strd r4, r5, [sp, #152]\n\t" + "ldrd r12, lr, [%[sha512], #32]\n\t" + "ldrd r4, r5, [%[sha512], #40]\n\t" + "ldrd r6, r7, [sp, #160]\n\t" + "ldrd r8, r9, [sp, #168]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #32]\n\t" + "strd r4, r5, [%[sha512], #40]\n\t" + "strd r12, lr, [sp, #160]\n\t" + "strd r4, r5, [sp, #168]\n\t" + "ldrd r12, lr, [%[sha512], #48]\n\t" + "ldrd r4, r5, [%[sha512], #56]\n\t" + "ldrd r6, r7, [sp, #176]\n\t" + "ldrd r8, r9, [sp, #184]\n\t" + "adds r12, r12, r6\n\t" + "adc lr, lr, r7\n\t" + "adds r4, r4, r8\n\t" + "adc r5, r5, r9\n\t" + "strd r12, lr, [%[sha512], #48]\n\t" + "strd r4, r5, [%[sha512], #56]\n\t" + "strd r12, lr, [sp, #176]\n\t" + "strd r4, r5, [sp, #184]\n\t" + "subs %[len], %[len], #0x80\n\t" + "sub r3, r3, #0x200\n\t" + "add %[data], %[data], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + "eor r0, r0, r0\n\t" + "add sp, sp, #0xc0\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#include + +#ifndef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_SHA512_transform_neon_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 
0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + /* Load digest into working vars */ + "vldm.64 %[sha512], {d0-d7}\n\t" + /* Start of loop processing a block */ + "\n" + "L_sha512_len_neon_begin_%=: \n\t" + /* Load W */ + "vldm.64 %[data]!, {d16-d31}\n\t" + "vrev64.8 q8, q8\n\t" + "vrev64.8 q9, q9\n\t" + "vrev64.8 q10, q10\n\t" + "vrev64.8 q11, q11\n\t" + "vrev64.8 q12, q12\n\t" + "vrev64.8 q13, q13\n\t" + "vrev64.8 q14, q14\n\t" + "vrev64.8 q15, q15\n\t" + "mov r3, %[L_SHA512_transform_neon_len_k]\n\t" + "mov r12, #4\n\t" + /* Start of 16 rounds */ + "\n" + "L_sha512_len_neon_start_%=: \n\t" + /* Round 0 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d16\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 1 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d17\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Calc new W[0]-W[1] */ + "vext.8 q6, q8, q9, #8\n\t" + "vshl.u64 q4, q15, #45\n\t" + "vsri.u64 q4, q15, #19\n\t" + "vshl.u64 q5, q15, #3\n\t" + "vsri.u64 q5, q15, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q15, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q8, q5\n\t" + "vext.8 q7, q12, q13, #8\n\t" + "vadd.i64 q8, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q8, q5\n\t" + /* Round 2 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + 
"vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d18\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 3 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d19\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Calc new W[2]-W[3] */ + "vext.8 q6, q9, q10, #8\n\t" + "vshl.u64 q4, q8, #45\n\t" + "vsri.u64 q4, q8, #19\n\t" + "vshl.u64 q5, q8, #3\n\t" + "vsri.u64 q5, q8, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q8, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q9, q5\n\t" + "vext.8 q7, q13, q14, #8\n\t" + "vadd.i64 q9, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q9, q5\n\t" + /* Round 4 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d20\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 5 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d21\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Calc new W[4]-W[5] */ + "vext.8 q6, q10, q11, #8\n\t" + "vshl.u64 q4, q9, #45\n\t" + "vsri.u64 q4, q9, #19\n\t" + "vshl.u64 q5, q9, #3\n\t" + "vsri.u64 q5, q9, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q9, 
#6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q10, q5\n\t" + "vext.8 q7, q14, q15, #8\n\t" + "vadd.i64 q10, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q10, q5\n\t" + /* Round 6 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d22\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 7 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d23\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Calc new W[6]-W[7] */ + "vext.8 q6, q11, q12, #8\n\t" + "vshl.u64 q4, q10, #45\n\t" + "vsri.u64 q4, q10, #19\n\t" + "vshl.u64 q5, q10, #3\n\t" + "vsri.u64 q5, q10, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q10, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q11, q5\n\t" + "vext.8 q7, q15, q8, #8\n\t" + "vadd.i64 q11, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q11, q5\n\t" + /* Round 8 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d24\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 9 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, 
d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d25\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Calc new W[8]-W[9] */ + "vext.8 q6, q12, q13, #8\n\t" + "vshl.u64 q4, q11, #45\n\t" + "vsri.u64 q4, q11, #19\n\t" + "vshl.u64 q5, q11, #3\n\t" + "vsri.u64 q5, q11, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q11, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q12, q5\n\t" + "vext.8 q7, q8, q9, #8\n\t" + "vadd.i64 q12, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q12, q5\n\t" + /* Round 10 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d26\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 11 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d27\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Calc new W[10]-W[11] */ + "vext.8 q6, q13, q14, #8\n\t" + "vshl.u64 q4, q12, #45\n\t" + "vsri.u64 q4, q12, #19\n\t" + "vshl.u64 q5, q12, #3\n\t" + "vsri.u64 q5, q12, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q12, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q13, q5\n\t" + "vext.8 q7, q9, q10, #8\n\t" + "vadd.i64 q13, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q13, q5\n\t" + /* Round 12 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d28\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + 
"vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 13 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d29\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Calc new W[12]-W[13] */ + "vext.8 q6, q14, q15, #8\n\t" + "vshl.u64 q4, q13, #45\n\t" + "vsri.u64 q4, q13, #19\n\t" + "vshl.u64 q5, q13, #3\n\t" + "vsri.u64 q5, q13, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q13, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q14, q5\n\t" + "vext.8 q7, q10, q11, #8\n\t" + "vadd.i64 q14, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q14, q5\n\t" + /* Round 14 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d30\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 15 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d31\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Calc new W[14]-W[15] */ + "vext.8 q6, q15, q8, #8\n\t" + "vshl.u64 q4, q14, #45\n\t" + "vsri.u64 q4, q14, #19\n\t" + "vshl.u64 q5, q14, #3\n\t" + "vsri.u64 q5, q14, #61\n\t" + "veor q5, q4\n\t" + "vshr.u64 q4, q14, #6\n\t" + "veor q5, q4\n\t" + "vadd.i64 q15, q5\n\t" + "vext.8 q7, q11, q12, #8\n\t" + "vadd.i64 q15, q7\n\t" + "vshl.u64 q4, q6, #63\n\t" + "vsri.u64 q4, q6, #1\n\t" + "vshl.u64 q5, q6, #56\n\t" + "vsri.u64 q5, q6, #8\n\t" + "veor q5, q4\n\t" + "vshr.u64 q6, #7\n\t" + "veor q5, q6\n\t" + "vadd.i64 q15, q5\n\t" + "subs r12, r12, #1\n\t" + "bne L_sha512_len_neon_start_%=\n\t" + /* Round 0 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, 
d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d16\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 1 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d17\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Round 2 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d18\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 3 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d19\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Round 4 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d20\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl 
d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 5 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d21\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Round 6 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, #18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d22\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 7 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d23\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Round 8 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d4, #50\n\t" + "vsri.u64 d8, d4, #14\n\t" + "vshl.u64 d9, d0, #36\n\t" + "vsri.u64 d9, d0, #28\n\t" + "vshl.u64 d10, d4, #46\n\t" + "vsri.u64 d10, d4, #18\n\t" + "vshl.u64 d11, d0, #30\n\t" + "vsri.u64 d11, d0, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d4, #23\n\t" + "vsri.u64 d10, d4, #41\n\t" + "vshl.u64 d11, d0, #25\n\t" + "vsri.u64 d11, d0, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d12, d24\n\t" + "vmov d8, d4\n\t" + "veor d10, d1, d2\n\t" + "vadd.i64 d7, d12\n\t" + "vbsl d8, d5, d6\n\t" + "vbsl d10, d0, d2\n\t" + "vadd.i64 d7, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d3, d7\n\t" + "vadd.i64 d7, d10\n\t" + /* Round 9 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d3, #50\n\t" + "vsri.u64 d8, d3, #14\n\t" + "vshl.u64 d9, d7, #36\n\t" + "vsri.u64 d9, d7, #28\n\t" + "vshl.u64 d10, d3, #46\n\t" + "vsri.u64 d10, d3, #18\n\t" + "vshl.u64 d11, d7, #30\n\t" + "vsri.u64 d11, d7, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d3, #23\n\t" + "vsri.u64 d10, d3, #41\n\t" + "vshl.u64 d11, d7, #25\n\t" + "vsri.u64 d11, 
d7, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d12, d25\n\t" + "vmov d8, d3\n\t" + "veor d10, d0, d1\n\t" + "vadd.i64 d6, d12\n\t" + "vbsl d8, d4, d5\n\t" + "vbsl d10, d7, d1\n\t" + "vadd.i64 d6, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d2, d6\n\t" + "vadd.i64 d6, d10\n\t" + /* Round 10 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d2, #50\n\t" + "vsri.u64 d8, d2, #14\n\t" + "vshl.u64 d9, d6, #36\n\t" + "vsri.u64 d9, d6, #28\n\t" + "vshl.u64 d10, d2, #46\n\t" + "vsri.u64 d10, d2, #18\n\t" + "vshl.u64 d11, d6, #30\n\t" + "vsri.u64 d11, d6, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d2, #23\n\t" + "vsri.u64 d10, d2, #41\n\t" + "vshl.u64 d11, d6, #25\n\t" + "vsri.u64 d11, d6, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d12, d26\n\t" + "vmov d8, d2\n\t" + "veor d10, d7, d0\n\t" + "vadd.i64 d5, d12\n\t" + "vbsl d8, d3, d4\n\t" + "vbsl d10, d6, d0\n\t" + "vadd.i64 d5, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d1, d5\n\t" + "vadd.i64 d5, d10\n\t" + /* Round 11 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d1, #50\n\t" + "vsri.u64 d8, d1, #14\n\t" + "vshl.u64 d9, d5, #36\n\t" + "vsri.u64 d9, d5, #28\n\t" + "vshl.u64 d10, d1, #46\n\t" + "vsri.u64 d10, d1, #18\n\t" + "vshl.u64 d11, d5, #30\n\t" + "vsri.u64 d11, d5, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d1, #23\n\t" + "vsri.u64 d10, d1, #41\n\t" + "vshl.u64 d11, d5, #25\n\t" + "vsri.u64 d11, d5, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d12, d27\n\t" + "vmov d8, d1\n\t" + "veor d10, d6, d7\n\t" + "vadd.i64 d4, d12\n\t" + "vbsl d8, d2, d3\n\t" + "vbsl d10, d5, d7\n\t" + "vadd.i64 d4, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d0, d4\n\t" + "vadd.i64 d4, d10\n\t" + /* Round 12 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d0, #50\n\t" + "vsri.u64 d8, d0, #14\n\t" + "vshl.u64 d9, d4, #36\n\t" + "vsri.u64 d9, d4, #28\n\t" + "vshl.u64 d10, d0, #46\n\t" + "vsri.u64 d10, d0, #18\n\t" + "vshl.u64 d11, d4, #30\n\t" + "vsri.u64 d11, d4, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d0, #23\n\t" + "vsri.u64 d10, d0, #41\n\t" + "vshl.u64 d11, d4, #25\n\t" + "vsri.u64 d11, d4, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d12, d28\n\t" + "vmov d8, d0\n\t" + "veor d10, d5, d6\n\t" + "vadd.i64 d3, d12\n\t" + "vbsl d8, d1, d2\n\t" + "vbsl d10, d4, d6\n\t" + "vadd.i64 d3, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d7, d3\n\t" + "vadd.i64 d3, d10\n\t" + /* Round 13 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d7, #50\n\t" + "vsri.u64 d8, d7, #14\n\t" + "vshl.u64 d9, d3, #36\n\t" + "vsri.u64 d9, d3, #28\n\t" + "vshl.u64 d10, d7, #46\n\t" + "vsri.u64 d10, d7, #18\n\t" + "vshl.u64 d11, d3, #30\n\t" + "vsri.u64 d11, d3, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d7, #23\n\t" + "vsri.u64 d10, d7, #41\n\t" + "vshl.u64 d11, d3, #25\n\t" + "vsri.u64 d11, d3, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d12, d29\n\t" + "vmov d8, d7\n\t" + "veor d10, d4, d5\n\t" + "vadd.i64 d2, d12\n\t" + "vbsl d8, d0, d1\n\t" + "vbsl d10, d3, d5\n\t" + "vadd.i64 d2, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d6, d2\n\t" + "vadd.i64 d2, d10\n\t" + /* Round 14 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d6, #50\n\t" + "vsri.u64 d8, d6, #14\n\t" + "vshl.u64 d9, d2, #36\n\t" + "vsri.u64 d9, d2, #28\n\t" + "vshl.u64 d10, d6, #46\n\t" + "vsri.u64 d10, d6, 
#18\n\t" + "vshl.u64 d11, d2, #30\n\t" + "vsri.u64 d11, d2, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d6, #23\n\t" + "vsri.u64 d10, d6, #41\n\t" + "vshl.u64 d11, d2, #25\n\t" + "vsri.u64 d11, d2, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d12, d30\n\t" + "vmov d8, d6\n\t" + "veor d10, d3, d4\n\t" + "vadd.i64 d1, d12\n\t" + "vbsl d8, d7, d0\n\t" + "vbsl d10, d2, d4\n\t" + "vadd.i64 d1, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d5, d1\n\t" + "vadd.i64 d1, d10\n\t" + /* Round 15 */ + "vld1.64 {d12}, [r3]!\n\t" + "vshl.u64 d8, d5, #50\n\t" + "vsri.u64 d8, d5, #14\n\t" + "vshl.u64 d9, d1, #36\n\t" + "vsri.u64 d9, d1, #28\n\t" + "vshl.u64 d10, d5, #46\n\t" + "vsri.u64 d10, d5, #18\n\t" + "vshl.u64 d11, d1, #30\n\t" + "vsri.u64 d11, d1, #34\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vshl.u64 d10, d5, #23\n\t" + "vsri.u64 d10, d5, #41\n\t" + "vshl.u64 d11, d1, #25\n\t" + "vsri.u64 d11, d1, #39\n\t" + "veor d8, d10\n\t" + "veor d9, d11\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d12, d31\n\t" + "vmov d8, d5\n\t" + "veor d10, d2, d3\n\t" + "vadd.i64 d0, d12\n\t" + "vbsl d8, d6, d7\n\t" + "vbsl d10, d1, d3\n\t" + "vadd.i64 d0, d8\n\t" + "vadd.i64 d10, d9\n\t" + "vadd.i64 d4, d0\n\t" + "vadd.i64 d0, d10\n\t" + /* Add in digest from start */ + "vldm.64 %[sha512], {d8-d15}\n\t" + "vadd.i64 q0, q0, q4\n\t" + "vadd.i64 q1, q1, q5\n\t" + "vadd.i64 q2, q2, q6\n\t" + "vadd.i64 q3, q3, q7\n\t" + "vstm.64 %[sha512], {d0-d7}\n\t" + "subs %[len], %[len], #0x80\n\t" + "bne L_sha512_len_neon_begin_%=\n\t" + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k), [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k) + : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#endif /* !__aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index a8cf8e742..7b733929f 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -122,20 +122,19 @@ L_SHA512_transform_neon_len_ror8: .globl Transform_Sha512_Len .type Transform_Sha512_Len, %function Transform_Sha512_Len: - stp x29, x30, [sp, #-144]! + stp x29, x30, [sp, #-128]! 
add x29, sp, #0 str x17, [x29, #16] stp x18, x19, [x29, #24] stp x20, x21, [x29, #40] stp x22, x23, [x29, #56] stp x24, x25, [x29, #72] - stp x26, x27, [x29, #88] - str x28, [x29, #104] - stp d8, d9, [x29, #112] - stp d10, d11, [x29, #128] + str x26, [x29, #88] + stp d8, d9, [x29, #96] + stp d10, d11, [x29, #112] adr x3, L_SHA512_transform_neon_len_k - adr x28, L_SHA512_transform_neon_len_ror8 - ld1 {v11.16b}, [x28] + adr x26, L_SHA512_transform_neon_len_ror8 + ld1 {v11.16b}, [x26] # Load digest into working vars ldp x4, x5, [x0] ldp x6, x7, [x0, #16] @@ -146,53 +145,53 @@ L_sha512_len_neon_begin: # Load W # Copy digest to add in at end ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x1], #0x40 - mov x20, x4 + mov x18, x4 ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x1], #0x40 - mov x21, x5 + mov x19, x5 rev64 v0.16b, v0.16b - mov x22, x6 + mov x20, x6 rev64 v1.16b, v1.16b - mov x23, x7 + mov x21, x7 rev64 v2.16b, v2.16b - mov x24, x8 + mov x22, x8 rev64 v3.16b, v3.16b - mov x25, x9 + mov x23, x9 rev64 v4.16b, v4.16b - mov x26, x10 + mov x24, x10 rev64 v5.16b, v5.16b - mov x27, x11 + mov x25, x11 rev64 v6.16b, v6.16b rev64 v7.16b, v7.16b # Pre-calc: b ^ c eor x16, x5, x6 - mov x28, #4 + mov x26, #4 # Start of 16 rounds L_sha512_len_neon_start: # Round 0 - mov x18, v0.d[0] - ldr x19, [x3], #8 + mov x13, v0.d[0] + ldr x15, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 eor x12, x12, x8, ror 18 eor x14, x14, x4, ror 34 eor x12, x12, x8, ror 41 - eor x15, x14, x4, ror 39 + eor x14, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 and x16, x17, x16 and x12, x12, x8 - add x11, x11, x18 + add x11, x11, x13 eor x12, x12, x10 - add x11, x11, x19 + add x11, x11, x15 eor x16, x16, x5 add x11, x11, x12 - add x15, x15, x16 + add x14, x14, x16 add x7, x7, x11 - add x11, x11, x15 + add x11, x11, x14 # Round 1 - mov x18, v0.d[1] - ldr x19, [x3], #8 + mov x13, v0.d[1] + ldr x15, [x3], #8 ext v10.16b, v0.16b, v1.16b, #8 ror x12, x7, #14 shl v8.2d, v7.2d, #45 @@ -204,7 +203,7 @@ L_sha512_len_neon_start: sri v9.2d, v7.2d, #61 eor x12, x12, x7, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x11, ror 39 + eor x14, x14, x11, ror 39 ushr v8.2d, v7.2d, #6 add x10, x10, x12 eor v9.16b, v9.16b, v8.16b @@ -216,45 +215,45 @@ L_sha512_len_neon_start: add v0.2d, v0.2d, v9.2d and x12, x12, x7 shl v8.2d, v10.2d, #63 - add x10, x10, x18 + add x10, x10, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x9 tbl v9.16b, {v10.16b}, v11.16b - add x10, x10, x19 + add x10, x10, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x4 ushr v10.2d, v10.2d, #7 add x10, x10, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v0.2d, v0.2d, v9.2d add x6, x6, x10 - add x10, x10, x15 + add x10, x10, x14 # Round 2 - mov x18, v1.d[0] - ldr x19, [x3], #8 + mov x13, v1.d[0] + ldr x15, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 eor x12, x12, x6, ror 18 eor x14, x14, x10, ror 34 eor x12, x12, x6, ror 41 - eor x15, x14, x10, ror 39 + eor x14, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 and x16, x17, x16 and x12, x12, x6 - add x9, x9, x18 + add x9, x9, x13 eor x12, x12, x8 - add x9, x9, x19 + add x9, x9, x15 eor x16, x16, x11 add x9, x9, x12 - add x15, x15, x16 + add x14, x14, x16 add x5, x5, x9 - add x9, x9, x15 + add x9, x9, x14 # Round 3 - mov x18, v1.d[1] - ldr x19, [x3], #8 + mov x13, v1.d[1] + ldr x15, [x3], #8 ext v10.16b, v1.16b, v2.16b, #8 ror x12, x5, #14 shl v8.2d, v0.2d, #45 @@ -266,7 +265,7 @@ L_sha512_len_neon_start: sri v9.2d, v0.2d, #61 eor x12, x12, x5, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x9, 
ror 39 + eor x14, x14, x9, ror 39 ushr v8.2d, v0.2d, #6 add x8, x8, x12 eor v9.16b, v9.16b, v8.16b @@ -278,45 +277,45 @@ L_sha512_len_neon_start: add v1.2d, v1.2d, v9.2d and x12, x12, x5 shl v8.2d, v10.2d, #63 - add x8, x8, x18 + add x8, x8, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x7 tbl v9.16b, {v10.16b}, v11.16b - add x8, x8, x19 + add x8, x8, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x10 ushr v10.2d, v10.2d, #7 add x8, x8, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v1.2d, v1.2d, v9.2d add x4, x4, x8 - add x8, x8, x15 + add x8, x8, x14 # Round 4 - mov x18, v2.d[0] - ldr x19, [x3], #8 + mov x13, v2.d[0] + ldr x15, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 eor x12, x12, x4, ror 18 eor x14, x14, x8, ror 34 eor x12, x12, x4, ror 41 - eor x15, x14, x8, ror 39 + eor x14, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 and x16, x17, x16 and x12, x12, x4 - add x7, x7, x18 + add x7, x7, x13 eor x12, x12, x6 - add x7, x7, x19 + add x7, x7, x15 eor x16, x16, x9 add x7, x7, x12 - add x15, x15, x16 + add x14, x14, x16 add x11, x11, x7 - add x7, x7, x15 + add x7, x7, x14 # Round 5 - mov x18, v2.d[1] - ldr x19, [x3], #8 + mov x13, v2.d[1] + ldr x15, [x3], #8 ext v10.16b, v2.16b, v3.16b, #8 ror x12, x11, #14 shl v8.2d, v1.2d, #45 @@ -328,7 +327,7 @@ L_sha512_len_neon_start: sri v9.2d, v1.2d, #61 eor x12, x12, x11, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x7, ror 39 + eor x14, x14, x7, ror 39 ushr v8.2d, v1.2d, #6 add x6, x6, x12 eor v9.16b, v9.16b, v8.16b @@ -340,45 +339,45 @@ L_sha512_len_neon_start: add v2.2d, v2.2d, v9.2d and x12, x12, x11 shl v8.2d, v10.2d, #63 - add x6, x6, x18 + add x6, x6, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x5 tbl v9.16b, {v10.16b}, v11.16b - add x6, x6, x19 + add x6, x6, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x8 ushr v10.2d, v10.2d, #7 add x6, x6, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v2.2d, v2.2d, v9.2d add x10, x10, x6 - add x6, x6, x15 + add x6, x6, x14 # Round 6 - mov x18, v3.d[0] - ldr x19, [x3], #8 + mov x13, v3.d[0] + ldr x15, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 eor x12, x12, x10, ror 18 eor x14, x14, x6, ror 34 eor x12, x12, x10, ror 41 - eor x15, x14, x6, ror 39 + eor x14, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 and x16, x17, x16 and x12, x12, x10 - add x5, x5, x18 + add x5, x5, x13 eor x12, x12, x4 - add x5, x5, x19 + add x5, x5, x15 eor x16, x16, x7 add x5, x5, x12 - add x15, x15, x16 + add x14, x14, x16 add x9, x9, x5 - add x5, x5, x15 + add x5, x5, x14 # Round 7 - mov x18, v3.d[1] - ldr x19, [x3], #8 + mov x13, v3.d[1] + ldr x15, [x3], #8 ext v10.16b, v3.16b, v4.16b, #8 ror x12, x9, #14 shl v8.2d, v2.2d, #45 @@ -390,7 +389,7 @@ L_sha512_len_neon_start: sri v9.2d, v2.2d, #61 eor x12, x12, x9, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x5, ror 39 + eor x14, x14, x5, ror 39 ushr v8.2d, v2.2d, #6 add x4, x4, x12 eor v9.16b, v9.16b, v8.16b @@ -402,45 +401,45 @@ L_sha512_len_neon_start: add v3.2d, v3.2d, v9.2d and x12, x12, x9 shl v8.2d, v10.2d, #63 - add x4, x4, x18 + add x4, x4, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x11 tbl v9.16b, {v10.16b}, v11.16b - add x4, x4, x19 + add x4, x4, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x6 ushr v10.2d, v10.2d, #7 add x4, x4, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v3.2d, v3.2d, v9.2d add x8, x8, x4 - add x4, x4, x15 + add x4, x4, x14 # Round 8 - mov x18, v4.d[0] - ldr x19, [x3], #8 + mov x13, v4.d[0] + ldr x15, [x3], #8 ror x12, x8, #14 ror x14, 
x4, #28 eor x12, x12, x8, ror 18 eor x14, x14, x4, ror 34 eor x12, x12, x8, ror 41 - eor x15, x14, x4, ror 39 + eor x14, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 and x16, x17, x16 and x12, x12, x8 - add x11, x11, x18 + add x11, x11, x13 eor x12, x12, x10 - add x11, x11, x19 + add x11, x11, x15 eor x16, x16, x5 add x11, x11, x12 - add x15, x15, x16 + add x14, x14, x16 add x7, x7, x11 - add x11, x11, x15 + add x11, x11, x14 # Round 9 - mov x18, v4.d[1] - ldr x19, [x3], #8 + mov x13, v4.d[1] + ldr x15, [x3], #8 ext v10.16b, v4.16b, v5.16b, #8 ror x12, x7, #14 shl v8.2d, v3.2d, #45 @@ -452,7 +451,7 @@ L_sha512_len_neon_start: sri v9.2d, v3.2d, #61 eor x12, x12, x7, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x11, ror 39 + eor x14, x14, x11, ror 39 ushr v8.2d, v3.2d, #6 add x10, x10, x12 eor v9.16b, v9.16b, v8.16b @@ -464,45 +463,45 @@ L_sha512_len_neon_start: add v4.2d, v4.2d, v9.2d and x12, x12, x7 shl v8.2d, v10.2d, #63 - add x10, x10, x18 + add x10, x10, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x9 tbl v9.16b, {v10.16b}, v11.16b - add x10, x10, x19 + add x10, x10, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x4 ushr v10.2d, v10.2d, #7 add x10, x10, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v4.2d, v4.2d, v9.2d add x6, x6, x10 - add x10, x10, x15 + add x10, x10, x14 # Round 10 - mov x18, v5.d[0] - ldr x19, [x3], #8 + mov x13, v5.d[0] + ldr x15, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 eor x12, x12, x6, ror 18 eor x14, x14, x10, ror 34 eor x12, x12, x6, ror 41 - eor x15, x14, x10, ror 39 + eor x14, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 and x16, x17, x16 and x12, x12, x6 - add x9, x9, x18 + add x9, x9, x13 eor x12, x12, x8 - add x9, x9, x19 + add x9, x9, x15 eor x16, x16, x11 add x9, x9, x12 - add x15, x15, x16 + add x14, x14, x16 add x5, x5, x9 - add x9, x9, x15 + add x9, x9, x14 # Round 11 - mov x18, v5.d[1] - ldr x19, [x3], #8 + mov x13, v5.d[1] + ldr x15, [x3], #8 ext v10.16b, v5.16b, v6.16b, #8 ror x12, x5, #14 shl v8.2d, v4.2d, #45 @@ -514,7 +513,7 @@ L_sha512_len_neon_start: sri v9.2d, v4.2d, #61 eor x12, x12, x5, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x9, ror 39 + eor x14, x14, x9, ror 39 ushr v8.2d, v4.2d, #6 add x8, x8, x12 eor v9.16b, v9.16b, v8.16b @@ -526,45 +525,45 @@ L_sha512_len_neon_start: add v5.2d, v5.2d, v9.2d and x12, x12, x5 shl v8.2d, v10.2d, #63 - add x8, x8, x18 + add x8, x8, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x7 tbl v9.16b, {v10.16b}, v11.16b - add x8, x8, x19 + add x8, x8, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x10 ushr v10.2d, v10.2d, #7 add x8, x8, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v5.2d, v5.2d, v9.2d add x4, x4, x8 - add x8, x8, x15 + add x8, x8, x14 # Round 12 - mov x18, v6.d[0] - ldr x19, [x3], #8 + mov x13, v6.d[0] + ldr x15, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 eor x12, x12, x4, ror 18 eor x14, x14, x8, ror 34 eor x12, x12, x4, ror 41 - eor x15, x14, x8, ror 39 + eor x14, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 and x16, x17, x16 and x12, x12, x4 - add x7, x7, x18 + add x7, x7, x13 eor x12, x12, x6 - add x7, x7, x19 + add x7, x7, x15 eor x16, x16, x9 add x7, x7, x12 - add x15, x15, x16 + add x14, x14, x16 add x11, x11, x7 - add x7, x7, x15 + add x7, x7, x14 # Round 13 - mov x18, v6.d[1] - ldr x19, [x3], #8 + mov x13, v6.d[1] + ldr x15, [x3], #8 ext v10.16b, v6.16b, v7.16b, #8 ror x12, x11, #14 shl v8.2d, v5.2d, #45 @@ -576,7 +575,7 @@ L_sha512_len_neon_start: sri v9.2d, 
v5.2d, #61 eor x12, x12, x11, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x7, ror 39 + eor x14, x14, x7, ror 39 ushr v8.2d, v5.2d, #6 add x6, x6, x12 eor v9.16b, v9.16b, v8.16b @@ -588,45 +587,45 @@ L_sha512_len_neon_start: add v6.2d, v6.2d, v9.2d and x12, x12, x11 shl v8.2d, v10.2d, #63 - add x6, x6, x18 + add x6, x6, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x5 tbl v9.16b, {v10.16b}, v11.16b - add x6, x6, x19 + add x6, x6, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x8 ushr v10.2d, v10.2d, #7 add x6, x6, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v6.2d, v6.2d, v9.2d add x10, x10, x6 - add x6, x6, x15 + add x6, x6, x14 # Round 14 - mov x18, v7.d[0] - ldr x19, [x3], #8 + mov x13, v7.d[0] + ldr x15, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 eor x12, x12, x10, ror 18 eor x14, x14, x6, ror 34 eor x12, x12, x10, ror 41 - eor x15, x14, x6, ror 39 + eor x14, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 and x16, x17, x16 and x12, x12, x10 - add x5, x5, x18 + add x5, x5, x13 eor x12, x12, x4 - add x5, x5, x19 + add x5, x5, x15 eor x16, x16, x7 add x5, x5, x12 - add x15, x15, x16 + add x14, x14, x16 add x9, x9, x5 - add x5, x5, x15 + add x5, x5, x14 # Round 15 - mov x18, v7.d[1] - ldr x19, [x3], #8 + mov x13, v7.d[1] + ldr x15, [x3], #8 ext v10.16b, v7.16b, v0.16b, #8 ror x12, x9, #14 shl v8.2d, v6.2d, #45 @@ -638,7 +637,7 @@ L_sha512_len_neon_start: sri v9.2d, v6.2d, #61 eor x12, x12, x9, ror 41 eor v9.16b, v9.16b, v8.16b - eor x15, x14, x5, ror 39 + eor x14, x14, x5, ror 39 ushr v8.2d, v6.2d, #6 add x4, x4, x12 eor v9.16b, v9.16b, v8.16b @@ -650,382 +649,382 @@ L_sha512_len_neon_start: add v7.2d, v7.2d, v9.2d and x12, x12, x9 shl v8.2d, v10.2d, #63 - add x4, x4, x18 + add x4, x4, x13 sri v8.2d, v10.2d, #1 eor x12, x12, x11 tbl v9.16b, {v10.16b}, v11.16b - add x4, x4, x19 + add x4, x4, x15 eor v9.16b, v9.16b, v8.16b eor x17, x17, x6 ushr v10.2d, v10.2d, #7 add x4, x4, x12 eor v9.16b, v9.16b, v10.16b - add x15, x15, x17 + add x14, x14, x17 add v7.2d, v7.2d, v9.2d add x8, x8, x4 - add x4, x4, x15 - subs x28, x28, #1 + add x4, x4, x14 + subs x26, x26, #1 bne L_sha512_len_neon_start # Round 0 - mov x18, v0.d[0] - ldr x19, [x3], #8 + mov x13, v0.d[0] + ldr x15, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 eor x12, x12, x8, ror 18 eor x14, x14, x4, ror 34 eor x12, x12, x8, ror 41 - eor x15, x14, x4, ror 39 + eor x14, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 and x16, x17, x16 and x12, x12, x8 - add x11, x11, x18 + add x11, x11, x13 eor x12, x12, x10 - add x11, x11, x19 + add x11, x11, x15 eor x16, x16, x5 add x11, x11, x12 - add x15, x15, x16 + add x14, x14, x16 add x7, x7, x11 - add x11, x11, x15 + add x11, x11, x14 # Round 1 - mov x18, v0.d[1] - ldr x19, [x3], #8 + mov x13, v0.d[1] + ldr x15, [x3], #8 ror x12, x7, #14 ror x14, x11, #28 eor x12, x12, x7, ror 18 eor x14, x14, x11, ror 34 eor x12, x12, x7, ror 41 - eor x15, x14, x11, ror 39 + eor x14, x14, x11, ror 39 add x10, x10, x12 eor x16, x11, x4 eor x12, x8, x9 and x17, x16, x17 and x12, x12, x7 - add x10, x10, x18 + add x10, x10, x13 eor x12, x12, x9 - add x10, x10, x19 + add x10, x10, x15 eor x17, x17, x4 add x10, x10, x12 - add x15, x15, x17 + add x14, x14, x17 add x6, x6, x10 - add x10, x10, x15 + add x10, x10, x14 # Round 2 - mov x18, v1.d[0] - ldr x19, [x3], #8 + mov x13, v1.d[0] + ldr x15, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 eor x12, x12, x6, ror 18 eor x14, x14, x10, ror 34 eor x12, x12, x6, ror 41 - eor x15, x14, x10, ror 39 + eor x14, x14, x10, ror 
39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 and x16, x17, x16 and x12, x12, x6 - add x9, x9, x18 + add x9, x9, x13 eor x12, x12, x8 - add x9, x9, x19 + add x9, x9, x15 eor x16, x16, x11 add x9, x9, x12 - add x15, x15, x16 + add x14, x14, x16 add x5, x5, x9 - add x9, x9, x15 + add x9, x9, x14 # Round 3 - mov x18, v1.d[1] - ldr x19, [x3], #8 + mov x13, v1.d[1] + ldr x15, [x3], #8 ror x12, x5, #14 ror x14, x9, #28 eor x12, x12, x5, ror 18 eor x14, x14, x9, ror 34 eor x12, x12, x5, ror 41 - eor x15, x14, x9, ror 39 + eor x14, x14, x9, ror 39 add x8, x8, x12 eor x16, x9, x10 eor x12, x6, x7 and x17, x16, x17 and x12, x12, x5 - add x8, x8, x18 + add x8, x8, x13 eor x12, x12, x7 - add x8, x8, x19 + add x8, x8, x15 eor x17, x17, x10 add x8, x8, x12 - add x15, x15, x17 + add x14, x14, x17 add x4, x4, x8 - add x8, x8, x15 + add x8, x8, x14 # Round 4 - mov x18, v2.d[0] - ldr x19, [x3], #8 + mov x13, v2.d[0] + ldr x15, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 eor x12, x12, x4, ror 18 eor x14, x14, x8, ror 34 eor x12, x12, x4, ror 41 - eor x15, x14, x8, ror 39 + eor x14, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 and x16, x17, x16 and x12, x12, x4 - add x7, x7, x18 + add x7, x7, x13 eor x12, x12, x6 - add x7, x7, x19 + add x7, x7, x15 eor x16, x16, x9 add x7, x7, x12 - add x15, x15, x16 + add x14, x14, x16 add x11, x11, x7 - add x7, x7, x15 + add x7, x7, x14 # Round 5 - mov x18, v2.d[1] - ldr x19, [x3], #8 + mov x13, v2.d[1] + ldr x15, [x3], #8 ror x12, x11, #14 ror x14, x7, #28 eor x12, x12, x11, ror 18 eor x14, x14, x7, ror 34 eor x12, x12, x11, ror 41 - eor x15, x14, x7, ror 39 + eor x14, x14, x7, ror 39 add x6, x6, x12 eor x16, x7, x8 eor x12, x4, x5 and x17, x16, x17 and x12, x12, x11 - add x6, x6, x18 + add x6, x6, x13 eor x12, x12, x5 - add x6, x6, x19 + add x6, x6, x15 eor x17, x17, x8 add x6, x6, x12 - add x15, x15, x17 + add x14, x14, x17 add x10, x10, x6 - add x6, x6, x15 + add x6, x6, x14 # Round 6 - mov x18, v3.d[0] - ldr x19, [x3], #8 + mov x13, v3.d[0] + ldr x15, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 eor x12, x12, x10, ror 18 eor x14, x14, x6, ror 34 eor x12, x12, x10, ror 41 - eor x15, x14, x6, ror 39 + eor x14, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 and x16, x17, x16 and x12, x12, x10 - add x5, x5, x18 + add x5, x5, x13 eor x12, x12, x4 - add x5, x5, x19 + add x5, x5, x15 eor x16, x16, x7 add x5, x5, x12 - add x15, x15, x16 + add x14, x14, x16 add x9, x9, x5 - add x5, x5, x15 + add x5, x5, x14 # Round 7 - mov x18, v3.d[1] - ldr x19, [x3], #8 + mov x13, v3.d[1] + ldr x15, [x3], #8 ror x12, x9, #14 ror x14, x5, #28 eor x12, x12, x9, ror 18 eor x14, x14, x5, ror 34 eor x12, x12, x9, ror 41 - eor x15, x14, x5, ror 39 + eor x14, x14, x5, ror 39 add x4, x4, x12 eor x16, x5, x6 eor x12, x10, x11 and x17, x16, x17 and x12, x12, x9 - add x4, x4, x18 + add x4, x4, x13 eor x12, x12, x11 - add x4, x4, x19 + add x4, x4, x15 eor x17, x17, x6 add x4, x4, x12 - add x15, x15, x17 + add x14, x14, x17 add x8, x8, x4 - add x4, x4, x15 + add x4, x4, x14 # Round 8 - mov x18, v4.d[0] - ldr x19, [x3], #8 + mov x13, v4.d[0] + ldr x15, [x3], #8 ror x12, x8, #14 ror x14, x4, #28 eor x12, x12, x8, ror 18 eor x14, x14, x4, ror 34 eor x12, x12, x8, ror 41 - eor x15, x14, x4, ror 39 + eor x14, x14, x4, ror 39 add x11, x11, x12 eor x17, x4, x5 eor x12, x9, x10 and x16, x17, x16 and x12, x12, x8 - add x11, x11, x18 + add x11, x11, x13 eor x12, x12, x10 - add x11, x11, x19 + add x11, x11, x15 eor x16, x16, x5 add x11, x11, x12 - add x15, x15, x16 + add 
x14, x14, x16 add x7, x7, x11 - add x11, x11, x15 + add x11, x11, x14 # Round 9 - mov x18, v4.d[1] - ldr x19, [x3], #8 + mov x13, v4.d[1] + ldr x15, [x3], #8 ror x12, x7, #14 ror x14, x11, #28 eor x12, x12, x7, ror 18 eor x14, x14, x11, ror 34 eor x12, x12, x7, ror 41 - eor x15, x14, x11, ror 39 + eor x14, x14, x11, ror 39 add x10, x10, x12 eor x16, x11, x4 eor x12, x8, x9 and x17, x16, x17 and x12, x12, x7 - add x10, x10, x18 + add x10, x10, x13 eor x12, x12, x9 - add x10, x10, x19 + add x10, x10, x15 eor x17, x17, x4 add x10, x10, x12 - add x15, x15, x17 + add x14, x14, x17 add x6, x6, x10 - add x10, x10, x15 + add x10, x10, x14 # Round 10 - mov x18, v5.d[0] - ldr x19, [x3], #8 + mov x13, v5.d[0] + ldr x15, [x3], #8 ror x12, x6, #14 ror x14, x10, #28 eor x12, x12, x6, ror 18 eor x14, x14, x10, ror 34 eor x12, x12, x6, ror 41 - eor x15, x14, x10, ror 39 + eor x14, x14, x10, ror 39 add x9, x9, x12 eor x17, x10, x11 eor x12, x7, x8 and x16, x17, x16 and x12, x12, x6 - add x9, x9, x18 + add x9, x9, x13 eor x12, x12, x8 - add x9, x9, x19 + add x9, x9, x15 eor x16, x16, x11 add x9, x9, x12 - add x15, x15, x16 + add x14, x14, x16 add x5, x5, x9 - add x9, x9, x15 + add x9, x9, x14 # Round 11 - mov x18, v5.d[1] - ldr x19, [x3], #8 + mov x13, v5.d[1] + ldr x15, [x3], #8 ror x12, x5, #14 ror x14, x9, #28 eor x12, x12, x5, ror 18 eor x14, x14, x9, ror 34 eor x12, x12, x5, ror 41 - eor x15, x14, x9, ror 39 + eor x14, x14, x9, ror 39 add x8, x8, x12 eor x16, x9, x10 eor x12, x6, x7 and x17, x16, x17 and x12, x12, x5 - add x8, x8, x18 + add x8, x8, x13 eor x12, x12, x7 - add x8, x8, x19 + add x8, x8, x15 eor x17, x17, x10 add x8, x8, x12 - add x15, x15, x17 + add x14, x14, x17 add x4, x4, x8 - add x8, x8, x15 + add x8, x8, x14 # Round 12 - mov x18, v6.d[0] - ldr x19, [x3], #8 + mov x13, v6.d[0] + ldr x15, [x3], #8 ror x12, x4, #14 ror x14, x8, #28 eor x12, x12, x4, ror 18 eor x14, x14, x8, ror 34 eor x12, x12, x4, ror 41 - eor x15, x14, x8, ror 39 + eor x14, x14, x8, ror 39 add x7, x7, x12 eor x17, x8, x9 eor x12, x5, x6 and x16, x17, x16 and x12, x12, x4 - add x7, x7, x18 + add x7, x7, x13 eor x12, x12, x6 - add x7, x7, x19 + add x7, x7, x15 eor x16, x16, x9 add x7, x7, x12 - add x15, x15, x16 + add x14, x14, x16 add x11, x11, x7 - add x7, x7, x15 + add x7, x7, x14 # Round 13 - mov x18, v6.d[1] - ldr x19, [x3], #8 + mov x13, v6.d[1] + ldr x15, [x3], #8 ror x12, x11, #14 ror x14, x7, #28 eor x12, x12, x11, ror 18 eor x14, x14, x7, ror 34 eor x12, x12, x11, ror 41 - eor x15, x14, x7, ror 39 + eor x14, x14, x7, ror 39 add x6, x6, x12 eor x16, x7, x8 eor x12, x4, x5 and x17, x16, x17 and x12, x12, x11 - add x6, x6, x18 + add x6, x6, x13 eor x12, x12, x5 - add x6, x6, x19 + add x6, x6, x15 eor x17, x17, x8 add x6, x6, x12 - add x15, x15, x17 + add x14, x14, x17 add x10, x10, x6 - add x6, x6, x15 + add x6, x6, x14 # Round 14 - mov x18, v7.d[0] - ldr x19, [x3], #8 + mov x13, v7.d[0] + ldr x15, [x3], #8 ror x12, x10, #14 ror x14, x6, #28 eor x12, x12, x10, ror 18 eor x14, x14, x6, ror 34 eor x12, x12, x10, ror 41 - eor x15, x14, x6, ror 39 + eor x14, x14, x6, ror 39 add x5, x5, x12 eor x17, x6, x7 eor x12, x11, x4 and x16, x17, x16 and x12, x12, x10 - add x5, x5, x18 + add x5, x5, x13 eor x12, x12, x4 - add x5, x5, x19 + add x5, x5, x15 eor x16, x16, x7 add x5, x5, x12 - add x15, x15, x16 + add x14, x14, x16 add x9, x9, x5 - add x5, x5, x15 + add x5, x5, x14 # Round 15 - mov x18, v7.d[1] - ldr x19, [x3], #8 + mov x13, v7.d[1] + ldr x15, [x3], #8 ror x12, x9, #14 ror x14, x5, #28 eor x12, x12, x9, ror 18 eor 
x14, x14, x5, ror 34 eor x12, x12, x9, ror 41 - eor x15, x14, x5, ror 39 + eor x14, x14, x5, ror 39 add x4, x4, x12 eor x16, x5, x6 eor x12, x10, x11 and x17, x16, x17 and x12, x12, x9 - add x4, x4, x18 + add x4, x4, x13 eor x12, x12, x11 - add x4, x4, x19 + add x4, x4, x15 eor x17, x17, x6 add x4, x4, x12 - add x15, x15, x17 + add x14, x14, x17 add x8, x8, x4 - add x4, x4, x15 - add x11, x11, x27 - add x10, x10, x26 - add x9, x9, x25 - add x8, x8, x24 - add x7, x7, x23 - add x6, x6, x22 - add x5, x5, x21 - add x4, x4, x20 + add x4, x4, x14 + add x11, x11, x25 + add x10, x10, x24 + add x9, x9, x23 + add x8, x8, x22 + add x7, x7, x21 + add x6, x6, x20 + add x5, x5, x19 + add x4, x4, x18 adr x3, L_SHA512_transform_neon_len_k subs w2, w2, #0x80 bne L_sha512_len_neon_begin @@ -1033,17 +1032,15 @@ L_sha512_len_neon_start: stp x6, x7, [x0, #16] stp x8, x9, [x0, #32] stp x10, x11, [x0, #48] - eor x0, x0, x0 ldr x17, [x29, #16] ldp x18, x19, [x29, #24] ldp x20, x21, [x29, #40] ldp x22, x23, [x29, #56] ldp x24, x25, [x29, #72] - ldp x26, x27, [x29, #88] - ldr x28, [x29, #104] - ldp d8, d9, [x29, #112] - ldp d10, d11, [x29, #128] - ldp x29, x30, [sp], #0x90 + ldr x26, [x29, #88] + ldp d8, d9, [x29, #96] + ldp d10, d11, [x29, #112] + ldp x29, x30, [sp], #0x80 ret .size Transform_Sha512_Len,.-Transform_Sha512_Len #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.c b/wolfcrypt/src/port/arm/armv8-sha512-asm.c index dbc5a7dee..6c1119556 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.c @@ -115,14 +115,14 @@ static const uint64_t L_SHA512_transform_neon_len_ror8[] = { 0x80f0e0d0c0b0a09UL, }; -int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) { __asm__ __volatile__ ( "stp x29, x30, [sp, #-16]!\n\t" "add x29, sp, #0\n\t" "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" - "adr x28, %[L_SHA512_transform_neon_len_ror8]\n\t" - "ld1 {v11.16b}, [x28]\n\t" + "adr x26, %[L_SHA512_transform_neon_len_ror8]\n\t" + "ld1 {v11.16b}, [x26]\n\t" /* Load digest into working vars */ "ldp x4, x5, [%x[sha512]]\n\t" "ldp x6, x7, [%x[sha512], #16]\n\t" @@ -134,54 +134,54 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) /* Load W */ /* Copy digest to add in at end */ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%x[data]], #0x40\n\t" - "mov x20, x4\n\t" + "mov x18, x4\n\t" "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [%x[data]], #0x40\n\t" - "mov x21, x5\n\t" + "mov x19, x5\n\t" "rev64 v0.16b, v0.16b\n\t" - "mov x22, x6\n\t" + "mov x20, x6\n\t" "rev64 v1.16b, v1.16b\n\t" - "mov x23, x7\n\t" + "mov x21, x7\n\t" "rev64 v2.16b, v2.16b\n\t" - "mov x24, x8\n\t" + "mov x22, x8\n\t" "rev64 v3.16b, v3.16b\n\t" - "mov x25, x9\n\t" + "mov x23, x9\n\t" "rev64 v4.16b, v4.16b\n\t" - "mov x26, x10\n\t" + "mov x24, x10\n\t" "rev64 v5.16b, v5.16b\n\t" - "mov x27, x11\n\t" + "mov x25, x11\n\t" "rev64 v6.16b, v6.16b\n\t" "rev64 v7.16b, v7.16b\n\t" /* Pre-calc: b ^ c */ "eor x16, x5, x6\n\t" - "mov x28, #4\n\t" + "mov x26, #4\n\t" /* Start of 16 rounds */ "\n" "L_sha512_len_neon_start_%=: \n\t" /* Round 0 */ - "mov x18, v0.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v0.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" "eor x12, x12, x8, ror 18\n\t" "eor x14, x14, x4, ror 34\n\t" "eor x12, x12, x8, ror 41\n\t" - "eor x15, x14, x4, ror 39\n\t" + "eor x14, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, 
x9, x10\n\t" "and x16, x17, x16\n\t" "and x12, x12, x8\n\t" - "add x11, x11, x18\n\t" + "add x11, x11, x13\n\t" "eor x12, x12, x10\n\t" - "add x11, x11, x19\n\t" + "add x11, x11, x15\n\t" "eor x16, x16, x5\n\t" "add x11, x11, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x7, x7, x11\n\t" - "add x11, x11, x15\n\t" + "add x11, x11, x14\n\t" /* Round 1 */ - "mov x18, v0.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v0.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v0.16b, v1.16b, #8\n\t" "ror x12, x7, #14\n\t" "shl v8.2d, v7.2d, #45\n\t" @@ -193,7 +193,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v7.2d, #61\n\t" "eor x12, x12, x7, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x11, ror 39\n\t" + "eor x14, x14, x11, ror 39\n\t" "ushr v8.2d, v7.2d, #6\n\t" "add x10, x10, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -205,45 +205,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v0.2d, v0.2d, v9.2d\n\t" "and x12, x12, x7\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x10, x10, x18\n\t" + "add x10, x10, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x9\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x10, x10, x19\n\t" + "add x10, x10, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x4\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x10, x10, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v0.2d, v0.2d, v9.2d\n\t" "add x6, x6, x10\n\t" - "add x10, x10, x15\n\t" + "add x10, x10, x14\n\t" /* Round 2 */ - "mov x18, v1.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v1.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" "eor x12, x12, x6, ror 18\n\t" "eor x14, x14, x10, ror 34\n\t" "eor x12, x12, x6, ror 41\n\t" - "eor x15, x14, x10, ror 39\n\t" + "eor x14, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" "and x16, x17, x16\n\t" "and x12, x12, x6\n\t" - "add x9, x9, x18\n\t" + "add x9, x9, x13\n\t" "eor x12, x12, x8\n\t" - "add x9, x9, x19\n\t" + "add x9, x9, x15\n\t" "eor x16, x16, x11\n\t" "add x9, x9, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x5, x5, x9\n\t" - "add x9, x9, x15\n\t" + "add x9, x9, x14\n\t" /* Round 3 */ - "mov x18, v1.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v1.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v1.16b, v2.16b, #8\n\t" "ror x12, x5, #14\n\t" "shl v8.2d, v0.2d, #45\n\t" @@ -255,7 +255,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v0.2d, #61\n\t" "eor x12, x12, x5, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x9, ror 39\n\t" + "eor x14, x14, x9, ror 39\n\t" "ushr v8.2d, v0.2d, #6\n\t" "add x8, x8, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -267,45 +267,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v1.2d, v1.2d, v9.2d\n\t" "and x12, x12, x5\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x8, x8, x18\n\t" + "add x8, x8, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x7\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x8, x8, x19\n\t" + "add x8, x8, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x10\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x8, x8, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v1.2d, v1.2d, v9.2d\n\t" "add x4, x4, x8\n\t" - "add x8, x8, x15\n\t" + "add x8, x8, x14\n\t" /* Round 4 */ - "mov x18, v2.d[0]\n\t" - "ldr x19, [x3], 
#8\n\t" + "mov x13, v2.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" "eor x12, x12, x4, ror 18\n\t" "eor x14, x14, x8, ror 34\n\t" "eor x12, x12, x4, ror 41\n\t" - "eor x15, x14, x8, ror 39\n\t" + "eor x14, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" "and x16, x17, x16\n\t" "and x12, x12, x4\n\t" - "add x7, x7, x18\n\t" + "add x7, x7, x13\n\t" "eor x12, x12, x6\n\t" - "add x7, x7, x19\n\t" + "add x7, x7, x15\n\t" "eor x16, x16, x9\n\t" "add x7, x7, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x11, x11, x7\n\t" - "add x7, x7, x15\n\t" + "add x7, x7, x14\n\t" /* Round 5 */ - "mov x18, v2.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v2.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v2.16b, v3.16b, #8\n\t" "ror x12, x11, #14\n\t" "shl v8.2d, v1.2d, #45\n\t" @@ -317,7 +317,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v1.2d, #61\n\t" "eor x12, x12, x11, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x7, ror 39\n\t" + "eor x14, x14, x7, ror 39\n\t" "ushr v8.2d, v1.2d, #6\n\t" "add x6, x6, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -329,45 +329,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v2.2d, v2.2d, v9.2d\n\t" "and x12, x12, x11\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x6, x6, x18\n\t" + "add x6, x6, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x5\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x6, x6, x19\n\t" + "add x6, x6, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x8\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x6, x6, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v2.2d, v2.2d, v9.2d\n\t" "add x10, x10, x6\n\t" - "add x6, x6, x15\n\t" + "add x6, x6, x14\n\t" /* Round 6 */ - "mov x18, v3.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v3.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" "eor x12, x12, x10, ror 18\n\t" "eor x14, x14, x6, ror 34\n\t" "eor x12, x12, x10, ror 41\n\t" - "eor x15, x14, x6, ror 39\n\t" + "eor x14, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" "and x16, x17, x16\n\t" "and x12, x12, x10\n\t" - "add x5, x5, x18\n\t" + "add x5, x5, x13\n\t" "eor x12, x12, x4\n\t" - "add x5, x5, x19\n\t" + "add x5, x5, x15\n\t" "eor x16, x16, x7\n\t" "add x5, x5, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x9, x9, x5\n\t" - "add x5, x5, x15\n\t" + "add x5, x5, x14\n\t" /* Round 7 */ - "mov x18, v3.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v3.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v3.16b, v4.16b, #8\n\t" "ror x12, x9, #14\n\t" "shl v8.2d, v2.2d, #45\n\t" @@ -379,7 +379,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v2.2d, #61\n\t" "eor x12, x12, x9, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x5, ror 39\n\t" + "eor x14, x14, x5, ror 39\n\t" "ushr v8.2d, v2.2d, #6\n\t" "add x4, x4, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -391,45 +391,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v3.2d, v3.2d, v9.2d\n\t" "and x12, x12, x9\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x4, x4, x18\n\t" + "add x4, x4, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x11\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x4, x4, x19\n\t" + "add x4, x4, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x6\n\t" "ushr 
v10.2d, v10.2d, #7\n\t" "add x4, x4, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v3.2d, v3.2d, v9.2d\n\t" "add x8, x8, x4\n\t" - "add x4, x4, x15\n\t" + "add x4, x4, x14\n\t" /* Round 8 */ - "mov x18, v4.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v4.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" "eor x12, x12, x8, ror 18\n\t" "eor x14, x14, x4, ror 34\n\t" "eor x12, x12, x8, ror 41\n\t" - "eor x15, x14, x4, ror 39\n\t" + "eor x14, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" "and x16, x17, x16\n\t" "and x12, x12, x8\n\t" - "add x11, x11, x18\n\t" + "add x11, x11, x13\n\t" "eor x12, x12, x10\n\t" - "add x11, x11, x19\n\t" + "add x11, x11, x15\n\t" "eor x16, x16, x5\n\t" "add x11, x11, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x7, x7, x11\n\t" - "add x11, x11, x15\n\t" + "add x11, x11, x14\n\t" /* Round 9 */ - "mov x18, v4.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v4.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v4.16b, v5.16b, #8\n\t" "ror x12, x7, #14\n\t" "shl v8.2d, v3.2d, #45\n\t" @@ -441,7 +441,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v3.2d, #61\n\t" "eor x12, x12, x7, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x11, ror 39\n\t" + "eor x14, x14, x11, ror 39\n\t" "ushr v8.2d, v3.2d, #6\n\t" "add x10, x10, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -453,45 +453,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v4.2d, v4.2d, v9.2d\n\t" "and x12, x12, x7\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x10, x10, x18\n\t" + "add x10, x10, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x9\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x10, x10, x19\n\t" + "add x10, x10, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x4\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x10, x10, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v4.2d, v4.2d, v9.2d\n\t" "add x6, x6, x10\n\t" - "add x10, x10, x15\n\t" + "add x10, x10, x14\n\t" /* Round 10 */ - "mov x18, v5.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v5.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" "eor x12, x12, x6, ror 18\n\t" "eor x14, x14, x10, ror 34\n\t" "eor x12, x12, x6, ror 41\n\t" - "eor x15, x14, x10, ror 39\n\t" + "eor x14, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" "and x16, x17, x16\n\t" "and x12, x12, x6\n\t" - "add x9, x9, x18\n\t" + "add x9, x9, x13\n\t" "eor x12, x12, x8\n\t" - "add x9, x9, x19\n\t" + "add x9, x9, x15\n\t" "eor x16, x16, x11\n\t" "add x9, x9, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x5, x5, x9\n\t" - "add x9, x9, x15\n\t" + "add x9, x9, x14\n\t" /* Round 11 */ - "mov x18, v5.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v5.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v5.16b, v6.16b, #8\n\t" "ror x12, x5, #14\n\t" "shl v8.2d, v4.2d, #45\n\t" @@ -503,7 +503,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v4.2d, #61\n\t" "eor x12, x12, x5, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x9, ror 39\n\t" + "eor x14, x14, x9, ror 39\n\t" "ushr v8.2d, v4.2d, #6\n\t" "add x8, x8, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -515,45 +515,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add 
v5.2d, v5.2d, v9.2d\n\t" "and x12, x12, x5\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x8, x8, x18\n\t" + "add x8, x8, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x7\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x8, x8, x19\n\t" + "add x8, x8, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x10\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x8, x8, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v5.2d, v5.2d, v9.2d\n\t" "add x4, x4, x8\n\t" - "add x8, x8, x15\n\t" + "add x8, x8, x14\n\t" /* Round 12 */ - "mov x18, v6.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v6.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" "eor x12, x12, x4, ror 18\n\t" "eor x14, x14, x8, ror 34\n\t" "eor x12, x12, x4, ror 41\n\t" - "eor x15, x14, x8, ror 39\n\t" + "eor x14, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" "and x16, x17, x16\n\t" "and x12, x12, x4\n\t" - "add x7, x7, x18\n\t" + "add x7, x7, x13\n\t" "eor x12, x12, x6\n\t" - "add x7, x7, x19\n\t" + "add x7, x7, x15\n\t" "eor x16, x16, x9\n\t" "add x7, x7, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x11, x11, x7\n\t" - "add x7, x7, x15\n\t" + "add x7, x7, x14\n\t" /* Round 13 */ - "mov x18, v6.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v6.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v6.16b, v7.16b, #8\n\t" "ror x12, x11, #14\n\t" "shl v8.2d, v5.2d, #45\n\t" @@ -565,7 +565,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v5.2d, #61\n\t" "eor x12, x12, x11, ror 41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x7, ror 39\n\t" + "eor x14, x14, x7, ror 39\n\t" "ushr v8.2d, v5.2d, #6\n\t" "add x6, x6, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -577,45 +577,45 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v6.2d, v6.2d, v9.2d\n\t" "and x12, x12, x11\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x6, x6, x18\n\t" + "add x6, x6, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x5\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x6, x6, x19\n\t" + "add x6, x6, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x8\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x6, x6, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v6.2d, v6.2d, v9.2d\n\t" "add x10, x10, x6\n\t" - "add x6, x6, x15\n\t" + "add x6, x6, x14\n\t" /* Round 14 */ - "mov x18, v7.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v7.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" "eor x12, x12, x10, ror 18\n\t" "eor x14, x14, x6, ror 34\n\t" "eor x12, x12, x10, ror 41\n\t" - "eor x15, x14, x6, ror 39\n\t" + "eor x14, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" "and x16, x17, x16\n\t" "and x12, x12, x10\n\t" - "add x5, x5, x18\n\t" + "add x5, x5, x13\n\t" "eor x12, x12, x4\n\t" - "add x5, x5, x19\n\t" + "add x5, x5, x15\n\t" "eor x16, x16, x7\n\t" "add x5, x5, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x9, x9, x5\n\t" - "add x5, x5, x15\n\t" + "add x5, x5, x14\n\t" /* Round 15 */ - "mov x18, v7.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v7.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ext v10.16b, v7.16b, v0.16b, #8\n\t" "ror x12, x9, #14\n\t" "shl v8.2d, v6.2d, #45\n\t" @@ -627,7 +627,7 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "sri v9.2d, v6.2d, #61\n\t" "eor x12, x12, x9, ror 
41\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" - "eor x15, x14, x5, ror 39\n\t" + "eor x14, x14, x5, ror 39\n\t" "ushr v8.2d, v6.2d, #6\n\t" "add x4, x4, x12\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" @@ -639,382 +639,382 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "add v7.2d, v7.2d, v9.2d\n\t" "and x12, x12, x9\n\t" "shl v8.2d, v10.2d, #63\n\t" - "add x4, x4, x18\n\t" + "add x4, x4, x13\n\t" "sri v8.2d, v10.2d, #1\n\t" "eor x12, x12, x11\n\t" "tbl v9.16b, {v10.16b}, v11.16b\n\t" - "add x4, x4, x19\n\t" + "add x4, x4, x15\n\t" "eor v9.16b, v9.16b, v8.16b\n\t" "eor x17, x17, x6\n\t" "ushr v10.2d, v10.2d, #7\n\t" "add x4, x4, x12\n\t" "eor v9.16b, v9.16b, v10.16b\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add v7.2d, v7.2d, v9.2d\n\t" "add x8, x8, x4\n\t" - "add x4, x4, x15\n\t" - "subs x28, x28, #1\n\t" + "add x4, x4, x14\n\t" + "subs x26, x26, #1\n\t" "bne L_sha512_len_neon_start_%=\n\t" /* Round 0 */ - "mov x18, v0.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v0.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" "eor x12, x12, x8, ror 18\n\t" "eor x14, x14, x4, ror 34\n\t" "eor x12, x12, x8, ror 41\n\t" - "eor x15, x14, x4, ror 39\n\t" + "eor x14, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor x17, x4, x5\n\t" "eor x12, x9, x10\n\t" "and x16, x17, x16\n\t" "and x12, x12, x8\n\t" - "add x11, x11, x18\n\t" + "add x11, x11, x13\n\t" "eor x12, x12, x10\n\t" - "add x11, x11, x19\n\t" + "add x11, x11, x15\n\t" "eor x16, x16, x5\n\t" "add x11, x11, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x7, x7, x11\n\t" - "add x11, x11, x15\n\t" + "add x11, x11, x14\n\t" /* Round 1 */ - "mov x18, v0.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v0.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x7, #14\n\t" "ror x14, x11, #28\n\t" "eor x12, x12, x7, ror 18\n\t" "eor x14, x14, x11, ror 34\n\t" "eor x12, x12, x7, ror 41\n\t" - "eor x15, x14, x11, ror 39\n\t" + "eor x14, x14, x11, ror 39\n\t" "add x10, x10, x12\n\t" "eor x16, x11, x4\n\t" "eor x12, x8, x9\n\t" "and x17, x16, x17\n\t" "and x12, x12, x7\n\t" - "add x10, x10, x18\n\t" + "add x10, x10, x13\n\t" "eor x12, x12, x9\n\t" - "add x10, x10, x19\n\t" + "add x10, x10, x15\n\t" "eor x17, x17, x4\n\t" "add x10, x10, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x6, x6, x10\n\t" - "add x10, x10, x15\n\t" + "add x10, x10, x14\n\t" /* Round 2 */ - "mov x18, v1.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v1.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" "eor x12, x12, x6, ror 18\n\t" "eor x14, x14, x10, ror 34\n\t" "eor x12, x12, x6, ror 41\n\t" - "eor x15, x14, x10, ror 39\n\t" + "eor x14, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" "and x16, x17, x16\n\t" "and x12, x12, x6\n\t" - "add x9, x9, x18\n\t" + "add x9, x9, x13\n\t" "eor x12, x12, x8\n\t" - "add x9, x9, x19\n\t" + "add x9, x9, x15\n\t" "eor x16, x16, x11\n\t" "add x9, x9, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x5, x5, x9\n\t" - "add x9, x9, x15\n\t" + "add x9, x9, x14\n\t" /* Round 3 */ - "mov x18, v1.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v1.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x5, #14\n\t" "ror x14, x9, #28\n\t" "eor x12, x12, x5, ror 18\n\t" "eor x14, x14, x9, ror 34\n\t" "eor x12, x12, x5, ror 41\n\t" - "eor x15, x14, x9, ror 39\n\t" + "eor x14, x14, x9, ror 39\n\t" "add x8, x8, x12\n\t" "eor x16, x9, x10\n\t" "eor x12, x6, x7\n\t" "and x17, x16, x17\n\t" 
"and x12, x12, x5\n\t" - "add x8, x8, x18\n\t" + "add x8, x8, x13\n\t" "eor x12, x12, x7\n\t" - "add x8, x8, x19\n\t" + "add x8, x8, x15\n\t" "eor x17, x17, x10\n\t" "add x8, x8, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x4, x4, x8\n\t" - "add x8, x8, x15\n\t" + "add x8, x8, x14\n\t" /* Round 4 */ - "mov x18, v2.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v2.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" "eor x12, x12, x4, ror 18\n\t" "eor x14, x14, x8, ror 34\n\t" "eor x12, x12, x4, ror 41\n\t" - "eor x15, x14, x8, ror 39\n\t" + "eor x14, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" "and x16, x17, x16\n\t" "and x12, x12, x4\n\t" - "add x7, x7, x18\n\t" + "add x7, x7, x13\n\t" "eor x12, x12, x6\n\t" - "add x7, x7, x19\n\t" + "add x7, x7, x15\n\t" "eor x16, x16, x9\n\t" "add x7, x7, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x11, x11, x7\n\t" - "add x7, x7, x15\n\t" + "add x7, x7, x14\n\t" /* Round 5 */ - "mov x18, v2.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v2.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x11, #14\n\t" "ror x14, x7, #28\n\t" "eor x12, x12, x11, ror 18\n\t" "eor x14, x14, x7, ror 34\n\t" "eor x12, x12, x11, ror 41\n\t" - "eor x15, x14, x7, ror 39\n\t" + "eor x14, x14, x7, ror 39\n\t" "add x6, x6, x12\n\t" "eor x16, x7, x8\n\t" "eor x12, x4, x5\n\t" "and x17, x16, x17\n\t" "and x12, x12, x11\n\t" - "add x6, x6, x18\n\t" + "add x6, x6, x13\n\t" "eor x12, x12, x5\n\t" - "add x6, x6, x19\n\t" + "add x6, x6, x15\n\t" "eor x17, x17, x8\n\t" "add x6, x6, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x10, x10, x6\n\t" - "add x6, x6, x15\n\t" + "add x6, x6, x14\n\t" /* Round 6 */ - "mov x18, v3.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v3.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" "eor x12, x12, x10, ror 18\n\t" "eor x14, x14, x6, ror 34\n\t" "eor x12, x12, x10, ror 41\n\t" - "eor x15, x14, x6, ror 39\n\t" + "eor x14, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" "and x16, x17, x16\n\t" "and x12, x12, x10\n\t" - "add x5, x5, x18\n\t" + "add x5, x5, x13\n\t" "eor x12, x12, x4\n\t" - "add x5, x5, x19\n\t" + "add x5, x5, x15\n\t" "eor x16, x16, x7\n\t" "add x5, x5, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x9, x9, x5\n\t" - "add x5, x5, x15\n\t" + "add x5, x5, x14\n\t" /* Round 7 */ - "mov x18, v3.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v3.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x9, #14\n\t" "ror x14, x5, #28\n\t" "eor x12, x12, x9, ror 18\n\t" "eor x14, x14, x5, ror 34\n\t" "eor x12, x12, x9, ror 41\n\t" - "eor x15, x14, x5, ror 39\n\t" + "eor x14, x14, x5, ror 39\n\t" "add x4, x4, x12\n\t" "eor x16, x5, x6\n\t" "eor x12, x10, x11\n\t" "and x17, x16, x17\n\t" "and x12, x12, x9\n\t" - "add x4, x4, x18\n\t" + "add x4, x4, x13\n\t" "eor x12, x12, x11\n\t" - "add x4, x4, x19\n\t" + "add x4, x4, x15\n\t" "eor x17, x17, x6\n\t" "add x4, x4, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x8, x8, x4\n\t" - "add x4, x4, x15\n\t" + "add x4, x4, x14\n\t" /* Round 8 */ - "mov x18, v4.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v4.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x8, #14\n\t" "ror x14, x4, #28\n\t" "eor x12, x12, x8, ror 18\n\t" "eor x14, x14, x4, ror 34\n\t" "eor x12, x12, x8, ror 41\n\t" - "eor x15, x14, x4, ror 39\n\t" + "eor x14, x14, x4, ror 39\n\t" "add x11, x11, x12\n\t" "eor 
x17, x4, x5\n\t" "eor x12, x9, x10\n\t" "and x16, x17, x16\n\t" "and x12, x12, x8\n\t" - "add x11, x11, x18\n\t" + "add x11, x11, x13\n\t" "eor x12, x12, x10\n\t" - "add x11, x11, x19\n\t" + "add x11, x11, x15\n\t" "eor x16, x16, x5\n\t" "add x11, x11, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x7, x7, x11\n\t" - "add x11, x11, x15\n\t" + "add x11, x11, x14\n\t" /* Round 9 */ - "mov x18, v4.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v4.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x7, #14\n\t" "ror x14, x11, #28\n\t" "eor x12, x12, x7, ror 18\n\t" "eor x14, x14, x11, ror 34\n\t" "eor x12, x12, x7, ror 41\n\t" - "eor x15, x14, x11, ror 39\n\t" + "eor x14, x14, x11, ror 39\n\t" "add x10, x10, x12\n\t" "eor x16, x11, x4\n\t" "eor x12, x8, x9\n\t" "and x17, x16, x17\n\t" "and x12, x12, x7\n\t" - "add x10, x10, x18\n\t" + "add x10, x10, x13\n\t" "eor x12, x12, x9\n\t" - "add x10, x10, x19\n\t" + "add x10, x10, x15\n\t" "eor x17, x17, x4\n\t" "add x10, x10, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x6, x6, x10\n\t" - "add x10, x10, x15\n\t" + "add x10, x10, x14\n\t" /* Round 10 */ - "mov x18, v5.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v5.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x6, #14\n\t" "ror x14, x10, #28\n\t" "eor x12, x12, x6, ror 18\n\t" "eor x14, x14, x10, ror 34\n\t" "eor x12, x12, x6, ror 41\n\t" - "eor x15, x14, x10, ror 39\n\t" + "eor x14, x14, x10, ror 39\n\t" "add x9, x9, x12\n\t" "eor x17, x10, x11\n\t" "eor x12, x7, x8\n\t" "and x16, x17, x16\n\t" "and x12, x12, x6\n\t" - "add x9, x9, x18\n\t" + "add x9, x9, x13\n\t" "eor x12, x12, x8\n\t" - "add x9, x9, x19\n\t" + "add x9, x9, x15\n\t" "eor x16, x16, x11\n\t" "add x9, x9, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x5, x5, x9\n\t" - "add x9, x9, x15\n\t" + "add x9, x9, x14\n\t" /* Round 11 */ - "mov x18, v5.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v5.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x5, #14\n\t" "ror x14, x9, #28\n\t" "eor x12, x12, x5, ror 18\n\t" "eor x14, x14, x9, ror 34\n\t" "eor x12, x12, x5, ror 41\n\t" - "eor x15, x14, x9, ror 39\n\t" + "eor x14, x14, x9, ror 39\n\t" "add x8, x8, x12\n\t" "eor x16, x9, x10\n\t" "eor x12, x6, x7\n\t" "and x17, x16, x17\n\t" "and x12, x12, x5\n\t" - "add x8, x8, x18\n\t" + "add x8, x8, x13\n\t" "eor x12, x12, x7\n\t" - "add x8, x8, x19\n\t" + "add x8, x8, x15\n\t" "eor x17, x17, x10\n\t" "add x8, x8, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x4, x4, x8\n\t" - "add x8, x8, x15\n\t" + "add x8, x8, x14\n\t" /* Round 12 */ - "mov x18, v6.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v6.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x4, #14\n\t" "ror x14, x8, #28\n\t" "eor x12, x12, x4, ror 18\n\t" "eor x14, x14, x8, ror 34\n\t" "eor x12, x12, x4, ror 41\n\t" - "eor x15, x14, x8, ror 39\n\t" + "eor x14, x14, x8, ror 39\n\t" "add x7, x7, x12\n\t" "eor x17, x8, x9\n\t" "eor x12, x5, x6\n\t" "and x16, x17, x16\n\t" "and x12, x12, x4\n\t" - "add x7, x7, x18\n\t" + "add x7, x7, x13\n\t" "eor x12, x12, x6\n\t" - "add x7, x7, x19\n\t" + "add x7, x7, x15\n\t" "eor x16, x16, x9\n\t" "add x7, x7, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x11, x11, x7\n\t" - "add x7, x7, x15\n\t" + "add x7, x7, x14\n\t" /* Round 13 */ - "mov x18, v6.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v6.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x11, #14\n\t" "ror x14, x7, #28\n\t" "eor x12, x12, x11, ror 18\n\t" "eor x14, x14, x7, ror 34\n\t" "eor x12, x12, x11, ror 
41\n\t" - "eor x15, x14, x7, ror 39\n\t" + "eor x14, x14, x7, ror 39\n\t" "add x6, x6, x12\n\t" "eor x16, x7, x8\n\t" "eor x12, x4, x5\n\t" "and x17, x16, x17\n\t" "and x12, x12, x11\n\t" - "add x6, x6, x18\n\t" + "add x6, x6, x13\n\t" "eor x12, x12, x5\n\t" - "add x6, x6, x19\n\t" + "add x6, x6, x15\n\t" "eor x17, x17, x8\n\t" "add x6, x6, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x10, x10, x6\n\t" - "add x6, x6, x15\n\t" + "add x6, x6, x14\n\t" /* Round 14 */ - "mov x18, v7.d[0]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v7.d[0]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x10, #14\n\t" "ror x14, x6, #28\n\t" "eor x12, x12, x10, ror 18\n\t" "eor x14, x14, x6, ror 34\n\t" "eor x12, x12, x10, ror 41\n\t" - "eor x15, x14, x6, ror 39\n\t" + "eor x14, x14, x6, ror 39\n\t" "add x5, x5, x12\n\t" "eor x17, x6, x7\n\t" "eor x12, x11, x4\n\t" "and x16, x17, x16\n\t" "and x12, x12, x10\n\t" - "add x5, x5, x18\n\t" + "add x5, x5, x13\n\t" "eor x12, x12, x4\n\t" - "add x5, x5, x19\n\t" + "add x5, x5, x15\n\t" "eor x16, x16, x7\n\t" "add x5, x5, x12\n\t" - "add x15, x15, x16\n\t" + "add x14, x14, x16\n\t" "add x9, x9, x5\n\t" - "add x5, x5, x15\n\t" + "add x5, x5, x14\n\t" /* Round 15 */ - "mov x18, v7.d[1]\n\t" - "ldr x19, [x3], #8\n\t" + "mov x13, v7.d[1]\n\t" + "ldr x15, [x3], #8\n\t" "ror x12, x9, #14\n\t" "ror x14, x5, #28\n\t" "eor x12, x12, x9, ror 18\n\t" "eor x14, x14, x5, ror 34\n\t" "eor x12, x12, x9, ror 41\n\t" - "eor x15, x14, x5, ror 39\n\t" + "eor x14, x14, x5, ror 39\n\t" "add x4, x4, x12\n\t" "eor x16, x5, x6\n\t" "eor x12, x10, x11\n\t" "and x17, x16, x17\n\t" "and x12, x12, x9\n\t" - "add x4, x4, x18\n\t" + "add x4, x4, x13\n\t" "eor x12, x12, x11\n\t" - "add x4, x4, x19\n\t" + "add x4, x4, x15\n\t" "eor x17, x17, x6\n\t" "add x4, x4, x12\n\t" - "add x15, x15, x17\n\t" + "add x14, x14, x17\n\t" "add x8, x8, x4\n\t" - "add x4, x4, x15\n\t" - "add x11, x11, x27\n\t" - "add x10, x10, x26\n\t" - "add x9, x9, x25\n\t" - "add x8, x8, x24\n\t" - "add x7, x7, x23\n\t" - "add x6, x6, x22\n\t" - "add x5, x5, x21\n\t" - "add x4, x4, x20\n\t" + "add x4, x4, x14\n\t" + "add x11, x11, x25\n\t" + "add x10, x10, x24\n\t" + "add x9, x9, x23\n\t" + "add x8, x8, x22\n\t" + "add x7, x7, x21\n\t" + "add x6, x6, x20\n\t" + "add x5, x5, x19\n\t" + "add x4, x4, x18\n\t" "adr x3, %[L_SHA512_transform_neon_len_k]\n\t" "subs %w[len], %w[len], #0x80\n\t" "bne L_sha512_len_neon_begin_%=\n\t" @@ -1022,13 +1022,11 @@ int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "stp x6, x7, [%x[sha512], #16]\n\t" "stp x8, x9, [%x[sha512], #32]\n\t" "stp x10, x11, [%x[sha512], #48]\n\t" - "eor x0, x0, x0\n\t" "ldp x29, x30, [sp], #16\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) : [L_SHA512_transform_neon_len_k] "S" (L_SHA512_transform_neon_len_k), [L_SHA512_transform_neon_len_ror8] "S" (L_SHA512_transform_neon_len_ror8) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" ); - return (uint32_t)(size_t)sha512; } #endif /* __aarch64__ */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512.c 
b/wolfcrypt/src/port/arm/armv8-sha512.c index 7f33a57ad..06fa167b3 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512.c +++ b/wolfcrypt/src/port/arm/armv8-sha512.c @@ -93,7 +93,7 @@ int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) #endif /* WOLFSSL_SHA512 */ -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) +#ifndef WOLFSSL_ARMASM static const word64 K512[80] = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), @@ -177,7 +177,7 @@ static const word64 K512[80] = { h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) #define DATA sha512->buffer -static int Transform_Sha512(wc_Sha512* sha512) +static void Transform_Sha512(wc_Sha512* sha512) { const word64* K = K512; word32 j; @@ -222,7 +222,7 @@ static int Transform_Sha512(wc_Sha512* sha512) #undef DATA #define DATA ((word64*)data) -static int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +static void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) { const word64* K = K512; word32 j; @@ -324,16 +324,13 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le } if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) - ret = Transform_Sha512(sha512); +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); #else - ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, WC_SHA512_BLOCK_SIZE); #endif - if (ret == 0) - sha512->buffLen = 0; - else - len = 0; + sha512->buffLen = 0; } } @@ -369,7 +366,6 @@ int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) static WC_INLINE int Sha512Final(wc_Sha512* sha512) { byte* local = (byte*)sha512->buffer; - int ret; if (sha512 == NULL) { return BAD_FUNC_ARG; @@ -382,14 +378,12 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen); sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen; -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) - ret = Transform_Sha512(sha512); +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); #else - ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, WC_SHA512_BLOCK_SIZE); #endif - if (ret != 0) - return ret; sha512->buffLen = 0; } @@ -410,14 +404,12 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); -#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) - ret = Transform_Sha512(sha512); +#ifndef WOLFSSL_ARMASM + Transform_Sha512(sha512); #else - ret = Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, + Transform_Sha512_Len(sha512, (const byte*)sha512->buffer, WC_SHA512_BLOCK_SIZE); #endif - if (ret != 0) - return ret; #ifdef LITTLE_ENDIAN_ORDER ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h index 958e7688b..586289a8d 100644 --- a/wolfssl/wolfcrypt/sha512.h +++ b/wolfssl/wolfcrypt/sha512.h @@ -141,9 +141,9 @@ typedef struct wc_Sha512 { #endif /* HAVE_FIPS */ -#if defined(WOLFSSL_ARMASM) && defined(__aarch64__) -WOLFSSL_LOCAL int Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, - word32 len); +#ifdef WOLFSSL_ARMASM +WOLFSSL_LOCAL void Transform_Sha512_Len(wc_Sha512* sha512, const byte* 
data, + word32 len); #endif #ifdef WOLFSSL_SHA512
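
Reference note (not part of the patch): the C sketch below restates the SHA-512 primitives that the ARMv8-32 NEON rounds and the AArch64 Transform_Sha512_Len in the hunks above implement, as an aid to reading the register renaming in the diff. The rotation amounts match the ror/eor sequences and the vshl/vsri pairs in the assembly; helper names such as ROTR64, sha512_round and sha512_schedule_step are illustrative only and are not taken from the wolfSSL sources.

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

/* Sigma1(e) and Sigma0(a): ror #14/18/41 and ror #28/34/39 in the scalar
 * AArch64 rounds; vshl+vsri pairs (e.g. #50/#14) in the ARMv8-32 NEON rounds. */
static uint64_t Sigma1(uint64_t e) { return ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); }
static uint64_t Sigma0(uint64_t a) { return ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39); }

/* Message-schedule sigmas: in the AArch64 hunks sigma0 is the shl #63/sri #1,
 * tbl-based byte rotate (ror 8) and ushr #7 sequence; sigma1 is the
 * shl #45/sri #19, shl #3/sri #61 and ushr #6 sequence. */
static uint64_t sigma0(uint64_t w) { return ROTR64(w, 1) ^ ROTR64(w, 8) ^ (w >> 7); }
static uint64_t sigma1(uint64_t w) { return ROTR64(w, 19) ^ ROTR64(w, 61) ^ (w >> 6); }

/* Ch and Maj in the xor-reduced forms used by the scalar rounds (Maj reuses the
 * b ^ c value carried in x16/x17 between rounds); the NEON rounds compute the
 * same functions with vbsl bit-selects. */
static uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return ((f ^ g) & e) ^ g; }
static uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return ((a ^ b) & (b ^ c)) ^ b; }

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] for t >= 16. */
static void sha512_schedule_step(uint64_t W[80], int t)
{
    W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
}

/* One compression round; s[] holds the working variables a..h. */
static void sha512_round(uint64_t s[8], uint64_t Kt, uint64_t Wt)
{
    uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
    uint64_t T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt;
    uint64_t T2 = Sigma0(a) + Maj(a, b, c);
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}

The assembly avoids the variable rotation at the end of sha512_round by renaming registers from round to round, which is why each unrolled round in the diff uses the same instruction pattern on a shifted set of x4-x11 (or d0-d7) registers.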