From 7d67ffac69bfd13d0069bc6fa3089dea6537b2e6 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 24 Aug 2022 14:24:17 +1000 Subject: [PATCH] Fixup assembly to compile with ARMv7a --- configure.ac | 4 +- wolfcrypt/src/port/arm/armv8-32-curve25519.S | 2258 ++++++++--------- .../src/port/arm/armv8-32-curve25519_c.c | 1994 +++++++-------- wolfcrypt/src/port/arm/armv8-chacha.c | 13 +- wolfcrypt/src/port/arm/armv8-poly1305.c | 1 + wolfcrypt/src/port/arm/armv8-sha256.c | 4 + wolfcrypt/src/sha256.c | 1 + wolfcrypt/test/test.c | 14 +- 8 files changed, 2018 insertions(+), 2271 deletions(-) diff --git a/configure.ac b/configure.ac index c49e60fba..739617392 100644 --- a/configure.ac +++ b/configure.ac @@ -2081,11 +2081,11 @@ then AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]) ;; armv7a) - AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon-vfpv3 -DWOLFSSL_ARMASM_NO_CRYPTO" + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARMASM_NO_CRYPTO -DWOLFSSL_ARM_ARCH=7" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=no - AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon-vfpv3]) + AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon]) ;; *) AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8" diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index b9382c7c3..156116b3a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -3927,59 +3927,47 @@ fe_ge_dbl: ldr r2, [sp, #56] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r1, [sp, #4] ldr r0, [sp, #12] bl fe_sq @@ -3989,188 +3977,164 @@ fe_ge_dbl: # Add-Sub # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r2] - ldr r6, [r2, #4] - adds r7, r3, r5 + ldr r5, [r1, #4] + ldrd r6, r7, [r2] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r1, #8] - ldr r4, [r1, #12] - ldr r5, [r2, #8] - ldr r6, [r2, #12] + ldr r5, [r1, #12] + ldrd r6, r7, [r2, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r2, #16] - ldr r6, [r2, #20] + ldr r5, [r1, #20] + ldrd r6, r7, [r2, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r1, #24] - ldr r4, [r1, #28] - ldr r5, [r2, #24] - ldr r6, [r2, #28] + ldr r5, [r1, #28] + ldrd r6, r7, [r2, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp] ldr r1, [sp, #12] ldr r2, [sp, #4] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r1, [sp, #60] ldr r0, [sp, #12] bl fe_sq2 @@ -4178,59 +4142,47 @@ fe_ge_dbl: ldr r1, [sp, #8] # Sub ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] - ldr r7, [r1] - ldr r8, [r1, #4] - ldr r9, [r1, #8] - ldr r10, [r1, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] + ldrd r8, r9, [r1] + ldrd r10, r11, [r1, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r0, #16] - ldr r4, [r0, #20] - ldr r5, [r0, #24] - ldr r6, [r0, #28] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r0, #20] + ldrd r6, r7, [r0, #24] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] add sp, sp, #16 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_dbl,.-fe_ge_dbl @@ -4250,117 +4202,93 @@ fe_ge_madd: ldr r2, [sp, #68] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #72] ldr r2, [sp, #68] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #88] ldr r1, [sp] ldr r0, [sp, #8] @@ -4379,300 +4307,270 @@ fe_ge_madd: # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #76] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #8] ldr r1, [sp, #12] # Add-Sub # Add ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r1] - ldr r6, [r1, #4] - adds r7, r3, r5 + ldr r5, [r0, #4] + ldrd r6, r7, [r1] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r0, #8] - ldr r4, [r0, #12] - ldr r5, [r1, #8] - ldr r6, [r1, #12] + ldr r5, [r0, #12] + ldrd r6, r7, [r1, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r0, #16] - ldr r4, [r0, #20] - ldr r5, [r1, #16] - ldr r6, [r1, #20] + ldr r5, [r0, #20] + ldrd r6, r7, [r1, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r0, #24] - ldr r4, [r0, #28] - ldr r5, [r1, #24] - ldr r6, [r1, #28] + ldr r5, [r0, #28] + ldrd r6, r7, [r1, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_madd,.-fe_ge_madd @@ -4692,117 +4590,93 @@ fe_ge_msub: ldr r2, [sp, #68] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #72] ldr r2, [sp, #68] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #92] ldr r1, [sp] ldr r0, [sp, #8] @@ -4821,300 +4695,270 @@ fe_ge_msub: # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #76] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #12] ldr r1, [sp, #8] # Add-Sub # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r1, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r1, #8] - ldr r4, [r1, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r1, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r1, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r1, #24] - ldr r4, [r1, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r1, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #32 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_msub,.-fe_ge_msub @@ -5134,117 +4978,93 @@ fe_ge_add: ldr r2, [sp, #132] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #136] ldr r2, [sp, #132] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #156] ldr r1, [sp] ldr r0, [sp, #8] @@ -5265,303 +5085,273 @@ fe_ge_add: ldr r1, [sp] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp] ldr r2, [sp, #8] # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #8] ldr r1, [sp, #12] add r2, sp, #16 # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r1] - ldr r6, [r1, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r1] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r1, #8] - ldr r6, [r1, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r1, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r1, #16] - ldr r6, [r1, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r1, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r1, #24] - ldr r6, [r1, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r1, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_add,.-fe_ge_add @@ -5581,117 +5371,93 @@ fe_ge_sub: ldr r2, [sp, #132] # Add ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - adds r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + adds r8, r3, r8 adcs r9, r5, r9 adcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + adcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - adcs r7, r3, r7 - adcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + adcs r8, r3, r8 adcs r9, r5, r9 - adc r10, r6, r10 + adcs r10, r6, r10 + adc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp, #136] ldr r2, [sp, #132] # Sub ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r2] - ldr r8, [r2, #4] - ldr r9, [r2, #8] - ldr r10, [r2, #12] - subs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r2] + ldrd r10, r11, [r2, #8] + subs r8, r3, r8 sbcs r9, r5, r9 sbcs r10, r6, r10 - str r7, [r0] - str r8, [r0, #4] - str r9, [r0, #8] - str r10, [r0, #12] + sbcs r11, r7, r11 + strd r8, r9, [r0] + strd r10, r11, [r0, #8] ldr r3, [r1, #16] - ldr r4, [r1, #20] - ldr r5, [r1, #24] - ldr r6, [r1, #28] - ldr r7, [r2, #16] - ldr r8, [r2, #20] - ldr r9, [r2, #24] - ldr r10, [r2, #28] - sbcs r7, r3, r7 - sbcs r8, r4, r8 + ldr r5, [r1, #20] + ldrd r6, r7, [r1, #24] + ldrd r8, r9, [r2, #16] + ldrd r10, r11, [r2, #24] + sbcs r8, r3, r8 sbcs r9, r5, r9 - sbc r10, r6, r10 + sbcs r10, r6, r10 + sbc r11, r7, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r0] - ldr r4, [r0, #4] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r0, #4] + ldrd r6, r7, [r0, #8] adds r3, r3, r12 - adcs r4, r4, r11 - adcs r5, r5, r11 - adcs r6, r6, r11 - adcs r7, r7, r11 - adcs r8, r8, r11 - adcs r9, r9, r11 - adc r10, r10, lr + adcs r5, r5, r4 + adcs r6, r6, r4 + adcs r7, r7, r4 + adcs r8, r8, r4 + adcs r9, r9, r4 + adcs r10, r10, r4 + adc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r2, [sp, #160] ldr r1, [sp] ldr r0, [sp, #8] @@ -5712,303 +5478,273 @@ fe_ge_sub: ldr r1, [sp] # Double ldr r3, [r1] - ldr r4, [r1, #4] - ldr r5, [r1, #8] - ldr r6, [r1, #12] - ldr r7, [r1, #16] - ldr r8, [r1, #20] - ldr r9, [r1, #24] - ldr r10, [r1, #28] + ldr r5, [r1, #4] + ldrd r6, r7, [r1, #8] + ldrd r8, r9, [r1, #16] + ldrd r10, r11, [r1, #24] adds r3, r3, r3 - adcs r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 adcs r7, r7, r7 adcs r8, r8, r8 adcs r9, r9, r9 - adc r10, r10, r10 + adcs r10, r10, r10 + adc r11, r11, r11 mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) subs r3, r3, r12 - sbcs r4, r4, r11 - sbcs r5, r5, r11 - sbcs r6, r6, r11 - sbcs r7, r7, r11 - sbcs r8, r8, r11 - sbcs r9, r9, r11 - sbc r10, r10, lr + sbcs r5, r5, r4 + sbcs r6, r6, r4 + sbcs r7, r7, r4 + sbcs r8, r8, r4 + sbcs r9, r9, r4 + sbcs r10, r10, r4 + sbc r11, r11, lr str r3, [r0] - str r4, [r0, #4] - str r5, [r0, #8] - str r6, [r0, #12] - str r7, [r0, #16] - str r8, [r0, #20] - str r9, [r0, #24] - str r10, [r0, #28] + str r5, [r0, #4] + strd r6, r7, [r0, #8] + strd r8, r9, [r0, #16] + strd r10, r11, [r0, #24] ldr r0, [sp, #4] ldr r1, [sp] ldr r2, [sp, #8] # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] ldr r0, [sp, #12] ldr r1, [sp, #8] add r2, sp, #16 # Add-Sub # Add ldr r3, [r2] - ldr r4, [r2, #4] - ldr r5, [r0] - ldr r6, [r0, #4] - adds r7, r3, r5 + ldr r5, [r2, #4] + ldrd r6, r7, [r0] + adds r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0] - str r8, [r0, #4] + strd r8, r9, [r0] # Sub - subs r9, r3, r5 + subs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1] - str r10, [r1, #4] + strd r10, r11, [r1] # Add ldr r3, [r2, #8] - ldr r4, [r2, #12] - ldr r5, [r0, #8] - ldr r6, [r0, #12] + ldr r5, [r2, #12] + ldrd r6, r7, [r0, #8] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #8] - str r8, [r0, #12] + strd r8, r9, [r0, #8] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #8] - str r10, [r1, #12] + strd r10, r11, [r1, #8] # Add ldr r3, [r2, #16] - ldr r4, [r2, #20] - ldr r5, [r0, #16] - ldr r6, [r0, #20] + ldr r5, [r2, #20] + ldrd r6, r7, [r0, #16] adds r12, r12, #-1 - adcs r7, r3, r5 + adcs r8, r3, r6 mov r12, #0 - adcs r8, r4, r6 + adcs r9, r5, r7 adc r12, r12, #0 - str r7, [r0, #16] - str r8, [r0, #20] + strd r8, r9, [r0, #16] # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 + sbcs r10, r3, r6 mov lr, #0 - sbcs r10, r4, r6 + sbcs r11, r5, r7 adc lr, lr, #0 - str r9, [r1, #16] - str r10, [r1, #20] + strd r10, r11, [r1, #16] # Add ldr r3, [r2, #24] - ldr r4, [r2, #28] - ldr r5, [r0, #24] - ldr r6, [r0, #28] + ldr r5, [r2, #28] + ldrd r6, r7, [r0, #24] adds r12, r12, #-1 - adcs r7, r3, r5 - adc r8, r4, r6 + adcs r8, r3, r6 + adc r9, r5, r7 # Sub adds lr, lr, #-1 - sbcs r9, r3, r5 - sbc r10, r4, r6 + sbcs r10, r3, r6 + sbc r11, r5, r7 mov r12, #-19 - asr r11, r8, #31 + asr r4, r9, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Sub modulus (if overflow) ldr r3, [r0] - ldr r4, [r0, #4] + ldr r5, [r0, #4] subs r3, r3, r12 - sbcs r4, r4, r11 + sbcs r5, r5, r4 str r3, [r0] - str r4, [r0, #4] + str r5, [r0, #4] ldr r3, [r0, #8] - ldr r4, [r0, #12] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #12] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #8] - str r4, [r0, #12] + str r5, [r0, #12] ldr r3, [r0, #16] - ldr r4, [r0, #20] - sbcs r3, r3, r11 - sbcs r4, r4, r11 + ldr r5, [r0, #20] + sbcs r3, r3, r4 + sbcs r5, r5, r4 str r3, [r0, #16] - str r4, [r0, #20] - sbcs r7, r7, r11 - sbc r8, r8, lr - str r7, [r0, #24] - str r8, [r0, #28] + str r5, [r0, #20] + sbcs r8, r8, r4 + sbc r9, r9, lr + strd r8, r9, [r0, #24] mov r12, #-19 - asr r11, r10, #31 + asr r4, r11, #31 # Mask the modulus - and r12, r11, r12 - and lr, r11, #0x7fffffff + and r12, r4, r12 + and lr, r4, #0x7fffffff # Add modulus (if underflow) ldr r3, [r1] - ldr r4, [r1, #4] + ldr r5, [r1, #4] adds r3, r3, r12 - adcs r4, r4, r11 + adcs r5, r5, r4 str r3, [r1] - str r4, [r1, #4] + str r5, [r1, #4] ldr r3, [r1, #8] - ldr r4, [r1, #12] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #12] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #8] - str r4, [r1, #12] + str r5, [r1, #12] ldr r3, [r1, #16] - ldr r4, [r1, #20] - adcs r3, r3, r11 - adcs r4, r4, r11 + ldr r5, [r1, #20] + adcs r3, r3, r4 + adcs r5, r5, r4 str r3, [r1, #16] - str r4, [r1, #20] - adcs r9, r9, r11 - adc r10, r10, lr - str r9, [r1, #24] - str r10, [r1, #28] + str r5, [r1, #20] + adcs r10, r10, r4 + adc r11, r11, lr + strd r10, r11, [r1, #24] add sp, sp, #0x60 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_ge_sub,.-fe_ge_sub diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 3967f1836..7a5be1771 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -3936,44 +3936,44 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #52]\n\t" "ldr r2, [sp, #56]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r1, [sp, #4]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_sq\n\t" @@ -3982,189 +3982,189 @@ void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r2, [sp]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r2]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r2]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #8]\n\t" - "ldrd r5, r6, [r2, #8]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r2, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r2, #16]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r2, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #24]\n\t" - "ldrd r5, r6, [r2, #24]\n\t" + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r2, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp]\n\t" "ldr r1, [sp, #12]\n\t" "ldr r2, [sp, #4]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r1, [sp, #60]\n\t" "ldr r0, [sp, #12]\n\t" "bl fe_sq2\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" /* Sub */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" - "ldrd r7, r8, [r1]\n\t" - "ldrd r9, r10, [r1, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" + "ldrd r8, r9, [r1]\n\t" + "ldrd r10, r11, [r1, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "add sp, sp, #16\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4187,86 +4187,86 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #88]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4284,237 +4284,237 @@ void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #76]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #12]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r1]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r1]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #8]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #16]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r0, #24]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" + "ldrd %[rt], r5, [r0, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #32\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4541,86 +4541,86 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #72]\n\t" "ldr r2, [sp, #68]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #92]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4638,237 +4638,237 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #76]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r1, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r1, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #32\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -4895,86 +4895,86 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #156]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -4994,240 +4994,240 @@ void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "add r0, sp, #16\n\t" "ldr r1, [sp]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp]\n\t" "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #8]\n\t" "ldr r1, [sp, #12]\n\t" "add r2, sp, #16\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r1]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r1]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r1, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r1, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #0x60\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : @@ -5255,86 +5255,86 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Add */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "adds r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "adds r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" "adcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "adcs r7, %[rt], r7\n\t" - "adcs r8, r4, r8\n\t" + "adcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "adcs r8, %[rt], r8\n\t" "adcs r9, r5, r9\n\t" - "adc r10, r6, r10\n\t" + "adcs r10, r6, r10\n\t" + "adc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp, #136]\n\t" "ldr r2, [sp, #132]\n\t" /* Sub */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r2]\n\t" - "ldrd r9, r10, [r2, #8]\n\t" - "subs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r2]\n\t" + "ldrd r10, r11, [r2, #8]\n\t" + "subs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" "sbcs r10, r6, r10\n\t" - "strd r7, r8, [r0]\n\t" - "strd r9, r10, [r0, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "ldrd r5, r6, [r1, #24]\n\t" - "ldrd r7, r8, [r2, #16]\n\t" - "ldrd r9, r10, [r2, #24]\n\t" - "sbcs r7, %[rt], r7\n\t" - "sbcs r8, r4, r8\n\t" + "sbcs r11, r7, r11\n\t" + "strd r8, r9, [r0]\n\t" + "strd r10, r11, [r0, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "ldrd r6, r7, [r1, #24]\n\t" + "ldrd r8, r9, [r2, #16]\n\t" + "ldrd r10, r11, [r2, #24]\n\t" + "sbcs r8, %[rt], r8\n\t" "sbcs r9, r5, r9\n\t" - "sbc r10, r6, r10\n\t" + "sbcs r10, r6, r10\n\t" + "sbc r11, r7, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r0]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "adcs r5, r5, r4\n\t" + "adcs r6, r6, r4\n\t" + "adcs r7, r7, r4\n\t" + "adcs r8, r8, r4\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r2, [sp, #160]\n\t" "ldr r1, [sp]\n\t" "ldr r0, [sp, #8]\n\t" @@ -5354,240 +5354,240 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "add r0, sp, #16\n\t" "ldr r1, [sp]\n\t" /* Double */ - "ldrd %[rt], r4, [r1]\n\t" - "ldrd r5, r6, [r1, #8]\n\t" - "ldrd r7, r8, [r1, #16]\n\t" - "ldrd r9, r10, [r1, #24]\n\t" + "ldrd %[rt], r5, [r1]\n\t" + "ldrd r6, r7, [r1, #8]\n\t" + "ldrd r8, r9, [r1, #16]\n\t" + "ldrd r10, r11, [r1, #24]\n\t" "adds %[rt], %[rt], %[rt]\n\t" - "adcs r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adcs r7, r7, r7\n\t" "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" - "adc r10, r10, r10\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r7, r7, r11\n\t" - "sbcs r8, r8, r11\n\t" - "sbcs r9, r9, r11\n\t" - "sbc r10, r10, lr\n\t" - "strd %[rt], r4, [r0]\n\t" - "strd r5, r6, [r0, #8]\n\t" - "strd r7, r8, [r0, #16]\n\t" - "strd r9, r10, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "sbcs r6, r6, r4\n\t" + "sbcs r7, r7, r4\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r4\n\t" + "sbcs r10, r10, r4\n\t" + "sbc r11, r11, lr\n\t" + "strd %[rt], r5, [r0]\n\t" + "strd r6, r7, [r0, #8]\n\t" + "strd r8, r9, [r0, #16]\n\t" + "strd r10, r11, [r0, #24]\n\t" "ldr r0, [sp, #4]\n\t" "ldr r1, [sp]\n\t" "ldr r2, [sp, #8]\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "ldr r0, [sp, #12]\n\t" "ldr r1, [sp, #8]\n\t" "add r2, sp, #16\n\t" /* Add-Sub */ /* Add */ - "ldrd %[rt], r4, [r2]\n\t" - "ldrd r5, r6, [r0]\n\t" - "adds r7, %[rt], r5\n\t" + "ldrd %[rt], r5, [r2]\n\t" + "ldrd r6, r7, [r0]\n\t" + "adds r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0]\n\t" + "strd r8, r9, [r0]\n\t" /* Sub */ - "subs r9, %[rt], r5\n\t" + "subs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1]\n\t" + "strd r10, r11, [r1]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #8]\n\t" - "ldrd r5, r6, [r0, #8]\n\t" + "ldrd %[rt], r5, [r2, #8]\n\t" + "ldrd r6, r7, [r0, #8]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #8]\n\t" + "strd r8, r9, [r0, #8]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #8]\n\t" + "strd r10, r11, [r1, #8]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #16]\n\t" - "ldrd r5, r6, [r0, #16]\n\t" + "ldrd %[rt], r5, [r2, #16]\n\t" + "ldrd r6, r7, [r0, #16]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" + "adcs r8, %[rt], r6\n\t" "mov r12, #0\n\t" - "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" - "strd r7, r8, [r0, #16]\n\t" + "strd r8, r9, [r0, #16]\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" + "sbcs r10, %[rt], r6\n\t" "mov lr, #0\n\t" - "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" - "strd r9, r10, [r1, #16]\n\t" + "strd r10, r11, [r1, #16]\n\t" /* Add */ - "ldrd %[rt], r4, [r2, #24]\n\t" - "ldrd r5, r6, [r0, #24]\n\t" + "ldrd %[rt], r5, [r2, #24]\n\t" + "ldrd r6, r7, [r0, #24]\n\t" "adds r12, r12, #-1\n\t" - "adcs r7, %[rt], r5\n\t" - "adc r8, r4, r6\n\t" + "adcs r8, %[rt], r6\n\t" + "adc r9, r5, r7\n\t" /* Sub */ "adds lr, lr, #-1\n\t" - "sbcs r9, %[rt], r5\n\t" - "sbc r10, r4, r6\n\t" + "sbcs r10, %[rt], r6\n\t" + "sbc r11, r5, r7\n\t" "mov r12, #-19\n\t" - "asr r11, r8, #31\n\t" + "asr r4, r9, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Sub modulus (if overflow) */ - "ldrd %[rt], r4, [r0]\n\t" + "ldrd %[rt], r5, [r0]\n\t" "subs %[rt], %[rt], r12\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0]\n\t" - "ldrd %[rt], r4, [r0, #8]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #8]\n\t" - "ldrd %[rt], r4, [r0, #16]\n\t" - "sbcs %[rt], %[rt], r11\n\t" - "sbcs r4, r4, r11\n\t" - "strd %[rt], r4, [r0, #16]\n\t" - "sbcs r7, r7, r11\n\t" - "sbc r8, r8, lr\n\t" - "strd r7, r8, [r0, #24]\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0]\n\t" + "ldrd %[rt], r5, [r0, #8]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #8]\n\t" + "ldrd %[rt], r5, [r0, #16]\n\t" + "sbcs %[rt], %[rt], r4\n\t" + "sbcs r5, r5, r4\n\t" + "strd %[rt], r5, [r0, #16]\n\t" + "sbcs r8, r8, r4\n\t" + "sbc r9, r9, lr\n\t" + "strd r8, r9, [r0, #24]\n\t" "mov r12, #-19\n\t" - "asr r11, r10, #31\n\t" + "asr r4, r11, #31\n\t" /* Mask the modulus */ - "and r12, r11, r12\n\t" - "and lr, r11, #0x7fffffff\n\t" + "and r12, r4, r12\n\t" + "and lr, r4, #0x7fffffff\n\t" /* Add modulus (if underflow) */ - "ldrd %[rt], r4, [r1]\n\t" + "ldrd %[rt], r5, [r1]\n\t" "adds %[rt], %[rt], r12\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1]\n\t" - "ldrd %[rt], r4, [r1, #8]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #8]\n\t" - "ldrd %[rt], r4, [r1, #16]\n\t" - "adcs %[rt], %[rt], r11\n\t" - "adcs r4, r4, r11\n\t" - "strd %[rt], r4, [r1, #16]\n\t" - "adcs r9, r9, r11\n\t" - "adc r10, r10, lr\n\t" - "strd r9, r10, [r1, #24]\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1]\n\t" + "ldrd %[rt], r5, [r1, #8]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #8]\n\t" + "ldrd %[rt], r5, [r1, #16]\n\t" + "adcs %[rt], %[rt], r4\n\t" + "adcs r5, r5, r4\n\t" + "strd %[rt], r5, [r1, #16]\n\t" + "adcs r10, r10, r4\n\t" + "adc r11, r11, lr\n\t" + "strd r10, r11, [r1, #24]\n\t" "add sp, sp, #0x60\n\t" : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) : diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index 7b0bd1a6c..83d242671 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -987,7 +987,12 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VMOV d4, r8, r9 \n\t" "STRD r10, r11, %[x_10] \n\t" "VMOV d5, r10, r11 \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 8) + "LDR r11, [r14, #4*14] \n\t" + "LDR r10, [r14, #4*15] \n\t" +#else "LDRD r11, r10, [r14, #4*14] \n\t" +#endif "VMOV q4, q0 \n\t" "VMOV q5, q1 \n\t" "VMOV q6, q2 \n\t" @@ -2754,11 +2759,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, /* XOR 8 bytes */ "CMP %[bytes], #8 \n\t" "BLT L_chacha20_arm32_64_lt_8_%= \n\t" - "VLDR d8, [%[m], #0] \n\t" - "ADD %[m], %[m], #8 \n\t" + "VLD1.64 { d8 }, [%[m]]! \n\t" "VEOR d8, d8, d0 \n\t" - "VSTR d8, [%[c], #0] \n\t" - "ADD %[c], %[c], #8 \n\t" + "VST1.64 { d8 }, [%[c]]! \n\t" "SUBS %[bytes], %[bytes], #8 \n\t" "VMOV d0, d1 \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t" @@ -2772,7 +2775,7 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, "EOR r12, r12, r14 \n\t" "STR r12, [%[c]], #4 \n\t" "SUBS %[bytes], %[bytes], #4 \n\t" - "VTRN.32 d0, d0 \n\t" + "VSHR.U64 d0, d0, #32 \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t" "\n" "L_chacha20_arm32_64_lt_4_%=: \n\t" diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index 637599827..5ed722bcb 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -29,6 +29,7 @@ #endif #include +#include #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index 4109dd19f..730f5c599 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -1537,7 +1537,11 @@ int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data) #else XMEMCPY(sha256->buffer, data, WC_SHA256_BLOCK_SIZE); #endif +#ifndef WOLFSSL_ARMASM_NO_CRYPTO Sha256Transform(sha256, data, 1); +#else + Transform_Sha256_Len(sha256, data, WC_SHA256_BLOCK_SIZE); +#endif return 0; } #endif diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index dab3c97e3..fda36af7b 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -43,6 +43,7 @@ on the specific device platform. #endif #include +#include /* * SHA256 Build Options: diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 4867a741d..927865c88 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -5921,8 +5921,10 @@ WOLFSSL_TEST_SUBROUTINE int chacha_test(void) return -4722; for (i = 0; i < 18; ++i) { - /* this will test all paths */ - /* block sizes: 1 2 3 4 7 8 15 16 31 32 63 64 127 128 255 256 511 512 */ + /* this will test all paths + * block sizes: 1 3 7 15 31 63 127 255 511 (i = 0- 8) + * 2 4 8 16 32 64 128 256 512 (i = 9-17) + */ block_size = (2 << (i%9)) - (i<9?1:0); keySz = 32; @@ -5936,16 +5938,16 @@ WOLFSSL_TEST_SUBROUTINE int chacha_test(void) if (ret != 0) return ret; - ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, block_size); - ret |= wc_Chacha_Process(&dec, plain_big, cipher_big, block_size); + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big , block_size); + ret |= wc_Chacha_Process(&dec, plain_big , cipher_big, block_size); if (ret != 0) return ret; if (XMEMCMP(plain_big, input_big, block_size)) - return -4723-i; + return -4740-i*2; if (XMEMCMP(cipher_big, cipher_big_result, block_size)) - return -4724-i; + return -4741-i*2; } /* Streaming test */